Documentos de Académico
Documentos de Profesional
Documentos de Cultura
Table of Contents
Procesamiento y corrección de strings ................................................................................................ 2
Localización de errores .............................................................................................................................. 5
Imputación ...................................................................................................................................................... 7
Supuestos ......................................................................................................................................................... 9
Código RMarkdown ................................................................................................................................... 27
Procesamiento y corrección de strings
# Load the housing data set and inspect its structure and summary statistics
library(readr)
archivo <- "datosHousing.csv"
datos <- read.csv(archivo)
str(datos)
summary(datos)
##
##     <1h oc  <1h ocean  <1H OCEAN    in land    inl and     inland     INLAND
##       1561        671       6454        384        320       1310       4194
##     ISLAND    nea bay   near bay   NEAR BAY NEAR OCEAN
##          4        410       1429        340       2531
## kk
##     <1h oc  <1h ocean  <1H OCEAN    in land    inl and     inland     INLAND
##       1561        671       6454        384        320       1310       4194
##     ISLAND    nea bay   near bay   NEAR BAY NEAR OCEAN
##          4        410       1429        340       2531
## kk
##     <1h Oc  <1h Ocean    In Land    Inl And     Inland     Island    Nea Bay
##       1561       7125        384        320       5504          4        410
##   Near Bay Near Ocean
##       1769       2531
#<1H OCEAN
#INLAND
#ISLAND
#NEAR BAY
#NEAR OCEAN
## kk
## <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
## 8686 6208 4 2179 2531
# Store the cleaned labels back in the data frame as a factor
datos[["ocean_proximity"]] <- as.factor(kk)
Para comprobar los cambios se presenta la tabla de frecuencias.
# Frequency table of the cleaned categories
table(datos$ocean_proximity)
##
##  <1H OCEAN     INLAND     ISLAND   NEAR BAY NEAR OCEAN
##       8686       6208          4       2179       2531
library(ggplot2)
# Bar chart of the same frequencies
ggplot(datos, aes(x = ocean_proximity)) +
  geom_bar(stat = "count", color = "red", fill = "light blue")
##
Localización de errores
# Missing-value statistics: total NA count, then NA count per column
sum(is.na(datos))
## [1] 10518
colSums(is.na(datos))
##                  X          longitude           latitude housing_median_age
##                  0               1032               1032               1032
##        total_rooms     total_bedrooms         population         households
##               1032               1230               1032               1032
##      median_income median_house_value    ocean_proximity
##               1032               1032               1032
# Count of empty strings per column; a column containing NA sums to NA
colSums(datos == "")
##                  X          longitude           latitude housing_median_age
##                  0                 NA                 NA                 NA
##        total_rooms     total_bedrooms         population         households
##                 NA                 NA                 NA                 NA
##      median_income median_house_value    ocean_proximity
##                 NA                 NA                 NA
longitude
# Rows whose longitude is missing. is.na() is used because comparing NA with ""
# yields NA, and indexing with NA returns all-NA rows instead of the records.
longitude <- datos[is.na(datos$longitude), ]
dim(longitude)
## [1] 1032 11
boxplot(datos$longitude)
Corrección de errores
# Values flagged as outliers by boxplot.stats(), variable by variable
boxplot.stats(datos$longitude)$out
## numeric(0)
boxplot.stats(datos$latitude)$out
## numeric(0)
boxplot.stats(datos$housing_median_age)$out
## integer(0)
boxplot.stats(datos$total_rooms)$out
length(boxplot.stats(datos$total_rooms)$out)
## [1] 1233
boxplot.stats(datos$total_bedrooms)$out
length(boxplot.stats(datos$total_bedrooms)$out)
# Mahalanobis distance of every row, computed over columns 2 to 10.
# Column means ignore NA and the covariance uses pairwise-complete observations.
datosN <- datos[, 2:10]
Mahal <- mahalanobis(
  datosN,
  colMeans(datosN, na.rm = TRUE),
  cov(datosN, use = "pairwise.complete.obs")
)
Mahal
Imputación
# k-nearest-neighbour imputation (VIM::kNN) of the data without outliers
library(VIM)
datos1 <- kNN(SinAtipicos)
datos1
# (the printed output is truncated in the source document)
## [...] FALSE
##   total_bedrooms_imp population_imp households_imp median_income_imp
## 1              FALSE          FALSE          FALSE             FALSE
## 2              FALSE           TRUE          FALSE
##  [ reached 'max' / getOption("max.print") -- omitted 15603 rows ]
# NA count before and after the imputation
sum(is.na(datosN))
## [1] 9486
sum(is.na(datos1))
## [1] 0
# Regress a random chi-squared response on columns 2 to 10 and draw a normal
# Q-Q plot of the studentized residuals
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)
qqnorm(standardized)
abline(0, 1)
# Histogram and one-sample Kolmogorov-Smirnov test against a normal distribution
# whose mean and sd are taken from the same column, one variable at a time
hist(datos1$longitude)
ks.test(datos1$longitude, "pnorm",
        mean = mean(datos1$longitude), sd = sd(datos1$longitude))
## Warning in ks.test(datos1$longitude, "pnorm", mean = mean(datos1$longitude), :
## ties should not be present for the Kolmogorov-Smirnov test
##
##  One-sample Kolmogorov-Smirnov test
##
## data:  datos1$longitude
## D = 0.20615, p-value < 2.2e-16
## alternative hypothesis: two-sided
hist(datos1$housing_median_age)
ks.test(datos1$housing_median_age, "pnorm",
        mean = mean(datos1$housing_median_age),
        sd = sd(datos1$housing_median_age))
##
##  One-sample Kolmogorov-Smirnov test
##
## data:  datos1$housing_median_age
## D = 0.060003, p-value < 2.2e-16
## alternative hypothesis: two-sided
hist(datos1$total_rooms)
ks.test(datos1$total_rooms, "pnorm",
        mean = mean(datos1$total_rooms), sd = sd(datos1$total_rooms))
##
##  One-sample Kolmogorov-Smirnov test
##
## data:  datos1$total_rooms
## D = 0.1344, p-value < 2.2e-16
## alternative hypothesis: two-sided
hist(datos1$total_bedrooms)
ks.test(datos1$total_bedrooms, "pnorm",
        mean = mean(datos1$total_bedrooms), sd = sd(datos1$total_bedrooms))
##
##  One-sample Kolmogorov-Smirnov test
##
## data:  datos1$total_bedrooms
## D = 0.12855, p-value < 2.2e-16
## alternative hypothesis: two-sided
hist(datos1$population)
ks.test(datos1$population, "pnorm",
        mean = mean(datos1$population), sd = sd(datos1$population))
## Warning in ks.test(datos1$population, "pnorm", mean = mean(datos1$population), :
## ties should not be present for the Kolmogorov-Smirnov test
##
##  One-sample Kolmogorov-Smirnov test
##
## data:  datos1$population
## D = 0.12413, p-value < 2.2e-16
## alternative hypothesis: two-sided
hist(datos1$households)
ks.test(datos1$households, "pnorm",
        mean = mean(datos1$households), sd = sd(datos1$households))
## Warning in ks.test(datos1$households, "pnorm", mean = mean(datos1$households), :
## ties should not be present for the Kolmogorov-Smirnov test
##
##  One-sample Kolmogorov-Smirnov test
##
## data:  datos1$households
## D = 0.1244, p-value < 2.2e-16
## alternative hypothesis: two-sided
hist(datos1$median_income)
ks.test(datos1$median_income, "pnorm",
        mean = mean(datos1$median_income), sd = sd(datos1$median_income))
##
##  One-sample Kolmogorov-Smirnov test
##
## data:  datos1$median_income
## D = 0.077547, p-value < 2.2e-16
## alternative hypothesis: two-sided
hist(datos1$median_house_value)
ks.test(datos1$median_house_value, "pnorm",
        mean = mean(datos1$median_house_value),
        sd = sd(datos1$median_house_value))
##
##  One-sample Kolmogorov-Smirnov test
##
## data:  datos1$median_house_value
## D = 0.10172, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Same random-response regression as before; Q-Q plot and histogram of its
# studentized residuals
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)
qqnorm(standardized)
abline(0, 1)
hist(standardized)
qqnorm(standardized)
abline(0, 1)
# For each response: fit it against ocean_proximity and draw a normal Q-Q plot
# of the studentized residuals with the reference line y = x.
# Repeated abline(0, 1) calls on the same plot were removed; they drew the
# same line twice.
ModeloLineal <- lm(longitude ~ ocean_proximity, data = SinAtipicos)
standardized <- rstudent(ModeloLineal)
qqnorm(standardized)
abline(0, 1)
ModeloLineal <- lm(latitude ~ ocean_proximity, data = SinAtipicos)
standardized <- rstudent(ModeloLineal)
qqnorm(standardized)
abline(0, 1)
ModeloLineal <- lm(total_rooms ~ ocean_proximity, data = SinAtipicos)
standardized <- rstudent(ModeloLineal)
qqnorm(standardized)
abline(0, 1)
ModeloLineal <- lm(total_bedrooms ~ ocean_proximity, data = SinAtipicos)
standardized <- rstudent(ModeloLineal)
qqnorm(standardized)
abline(0, 1)
Se rechaza el supuesto de linealidad.
# Fligner-Killeen tests of homogeneity of variances across ocean_proximity.
# NOTE(review): for six variables only the printed output was present here; the
# calls were restored from the `data:` line of each output — confirm.
numerica <- datos1[, 2:10]
result <- fligner.test(longitude ~ ocean_proximity, data = datos1)
result
##
##  Fligner-Killeen test of homogeneity of variances
##
## data:  longitude by ocean_proximity
## Fligner-Killeen:med chi-squared = 4252.1, df = 4, p-value < 2.2e-16
result <- fligner.test(latitude ~ ocean_proximity, data = datos1)
result
##
##  Fligner-Killeen test of homogeneity of variances
##
## data:  latitude by ocean_proximity
## Fligner-Killeen:med chi-squared = 4406.2, df = 4, p-value < 2.2e-16
result <- fligner.test(housing_median_age ~ ocean_proximity, data = datos1)
result
##
##  Fligner-Killeen test of homogeneity of variances
##
## data:  housing_median_age by ocean_proximity
## Fligner-Killeen:med chi-squared = 66.461, df = 4, p-value = 1.266e-13
result <- fligner.test(total_rooms ~ ocean_proximity, data = datos1)
result
##
##  Fligner-Killeen test of homogeneity of variances
##
## data:  total_rooms by ocean_proximity
## Fligner-Killeen:med chi-squared = 61.457, df = 4, p-value = 1.433e-12
result <- fligner.test(total_bedrooms ~ ocean_proximity, data = datos1)
result
##
##  Fligner-Killeen test of homogeneity of variances
##
## data:  total_bedrooms by ocean_proximity
## Fligner-Killeen:med chi-squared = 14.858, df = 4, p-value = 0.005005
result <- fligner.test(population ~ ocean_proximity, data = datos1)
result
##
##  Fligner-Killeen test of homogeneity of variances
##
## data:  population by ocean_proximity
## Fligner-Killeen:med chi-squared = 107.83, df = 4, p-value < 2.2e-16
result <- fligner.test(households ~ ocean_proximity, data = datos1)
result
##
##  Fligner-Killeen test of homogeneity of variances
##
## data:  households by ocean_proximity
## Fligner-Killeen:med chi-squared = 8.904, df = 4, p-value = 0.06355
result <- fligner.test(median_income ~ ocean_proximity, data = datos1)
result
##
##  Fligner-Killeen test of homogeneity of variances
##
## data:  median_income by ocean_proximity
## Fligner-Killeen:med chi-squared = 402.7, df = 4, p-value < 2.2e-16
result <- fligner.test(median_house_value ~ ocean_proximity, data = datos1)
result
##
##  Fligner-Killeen test of homogeneity of variances
##
## data:  median_house_value by ocean_proximity
## Fligner-Killeen:med chi-squared = 1845, df = 4, p-value < 2.2e-16
Código RMarkdown
```{r}
# Load the housing data set and inspect its structure
library(readr)
archivo <- "datosHousing.csv"
datos <- read.csv(archivo)
str(datos)
```
```{r}
summary(datos)
```
```{r}
table(datos$ocean_proximity)
```
```{r}
# Strip leading/trailing whitespace from the category labels
kk <- trimws(datos$ocean_proximity)
table(kk)
```
```{r}
# Normalise capitalisation. stringr is referenced explicitly because no
# library(stringr) call is visible in this document.
kk <- stringr::str_to_title(kk)
table(kk)
```
```{r}
# Target categories:
# <1H OCEAN
# INLAND
# ISLAND
# NEAR BAY
# NEAR OCEAN
# NOTE(review): the recoding statements were missing from this chunk. The
# mapping below is reconstructed from the frequency tables printed in the
# report (title-cased variants -> canonical labels) — confirm against the
# original notebook.
kk[kk %in% c("<1h Oc", "<1h Ocean")] <- "<1H OCEAN"
kk[kk %in% c("In Land", "Inl And", "Inland")] <- "INLAND"
kk[kk %in% "Island"] <- "ISLAND"
kk[kk %in% c("Nea Bay", "Near Bay")] <- "NEAR BAY"
kk[kk %in% "Near Ocean"] <- "NEAR OCEAN"
table(kk)
# Update
datos$ocean_proximity <- as.factor(kk)
```
Para comprobar los cambios se presenta la tabla de frecuencias.
```{r}
table(datos$ocean_proximity)
```
```{r}
library(ggplot2)
# Bar chart of the category frequencies (call restored from the rendered
# section of this document, where it appears after library(ggplot2))
ggplot(datos, aes(x = ocean_proximity)) +
  geom_bar(stat = "count", color = "red", fill = "light blue")
```
## Localización de errores
```{r}
# Missing-value statistics: total NA count, then NA count per column
sum(is.na(datos))
colSums(is.na(datos))
# Empty strings per column; na.rm = TRUE keeps NA entries from turning the
# whole column sum into NA
colSums(datos == "", na.rm = TRUE)
```
### longitude
```{r}
# Rows whose longitude is missing. is.na() is used because comparing NA with ""
# yields NA, and indexing with NA returns all-NA rows instead of the records.
longitude <- datos[is.na(datos$longitude), ]
dim(longitude)
```
```{r}
boxplot(datos$longitude)
```
## Corrección de errores
```{r}
boxplot.stats(datos$longitude)$out
```
```{r}
boxplot.stats(datos$latitude)$out
```
```{r}
boxplot.stats(datos$housing_median_age)$out
```
```{r}
# From here on the outliers are computed once per variable and reused for the
# count instead of calling boxplot.stats() twice
atipicos <- boxplot.stats(datos$total_rooms)$out
atipicos
length(atipicos)
```
```{r}
atipicos <- boxplot.stats(datos$total_bedrooms)$out
atipicos
length(atipicos)
```
```{r}
atipicos <- boxplot.stats(datos$population)$out
atipicos
length(atipicos)
```
```{r}
atipicos <- boxplot.stats(datos$households)$out
atipicos
length(atipicos)
```
```{r}
atipicos <- boxplot.stats(datos$median_income)$out
atipicos
length(atipicos)
```
```{r}
atipicos <- boxplot.stats(datos$median_house_value)$out
atipicos
length(atipicos)
```
```{r}
# Mahalanobis distance of every row, computed over columns 2 to 10.
# Column means ignore NA and the covariance uses pairwise-complete observations.
# The chunk that repeated this exact computation a second time was removed.
datosN <- datos[, 2:10]
Mahal <- mahalanobis(
  datosN,
  colMeans(datosN, na.rm = TRUE),
  cov(datosN, use = "pairwise.complete.obs")
)
Mahal
```
```{r}
# Cut-off score: chi-squared quantile at probability 1 - 0.001
# NOTE(review): the degrees of freedom are ncol(datosN) - 1; Mahalanobis
# cut-offs are commonly taken with ncol(datosN) degrees of freedom — confirm
# which one is intended before changing it.
PuntajeCorte <- qchisq(1 - 0.001, ncol(datosN) - 1)
PuntajeCorte
```
```{r}
summary(Mahal < PuntajeCorte)
```
```{r}
# Keep rows below the cut-off, plus rows whose distance is NA
SinAtipicos <- subset(datos, Mahal < PuntajeCorte | is.na(Mahal))
SinAtipicos
```
## Imputación
```{r}
# k-nearest-neighbour imputation (VIM::kNN) of the data without outliers
library(VIM)
datos1 <- kNN(SinAtipicos)
datos1
```
```{r}
# NA count before and after the imputation
sum(is.na(datosN))
sum(is.na(datos1))
```
```{r}
# Histograms of the imputed variables laid out on a 3 x 3 grid
par(mfrow = c(3, 3))
hist(datos1$longitude, breaks = 20, main = "longitude",
     border = "darkorange", col = "dodgerblue")
hist(datos1$latitude, breaks = 20, main = "latitude",
     border = "darkorange", col = "dodgerblue")
hist(datos1$housing_median_age, breaks = 20, main = "housing_median_age",
     border = "darkorange", col = "dodgerblue")
hist(datos1$total_rooms, breaks = 20, main = "total_rooms",
     border = "darkorange", col = "dodgerblue")
hist(datos1$total_bedrooms, breaks = 20, main = "total_bedrooms",
     border = "darkorange", col = "dodgerblue")
```
```{r}
# Regress a random chi-squared response on columns 2 to 10 and draw a normal
# Q-Q plot of the studentized residuals
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)
qqnorm(standardized)
abline(0, 1)
```
```{r}
# Histogram and one-sample Kolmogorov-Smirnov test against a normal
# distribution for each variable; mean and sd come from the same column.
# NOTE(review): ks.test documents that parameters should be pre-specified
# rather than estimated from the data — confirm the p-values are used with
# that caveat in mind.
hist(datos1$longitude)
ks.test(
  datos1$longitude, "pnorm",
  mean = mean(datos1$longitude),
  sd = sd(datos1$longitude)
)

hist(datos1$housing_median_age)
ks.test(
  datos1$housing_median_age, "pnorm",
  mean = mean(datos1$housing_median_age),
  sd = sd(datos1$housing_median_age)
)

hist(datos1$total_rooms)
ks.test(
  datos1$total_rooms, "pnorm",
  mean = mean(datos1$total_rooms),
  sd = sd(datos1$total_rooms)
)

hist(datos1$total_bedrooms)
ks.test(
  datos1$total_bedrooms, "pnorm",
  mean = mean(datos1$total_bedrooms),
  sd = sd(datos1$total_bedrooms)
)

hist(datos1$population)
ks.test(
  datos1$population, "pnorm",
  mean = mean(datos1$population),
  sd = sd(datos1$population)
)

hist(datos1$households)
ks.test(
  datos1$households, "pnorm",
  mean = mean(datos1$households),
  sd = sd(datos1$households)
)

hist(datos1$median_income)
ks.test(
  datos1$median_income, "pnorm",
  mean = mean(datos1$median_income),
  sd = sd(datos1$median_income)
)

hist(datos1$median_house_value)
ks.test(
  datos1$median_house_value, "pnorm",
  mean = mean(datos1$median_house_value),
  sd = sd(datos1$median_house_value)
)
```
```{r}
# Random-response regression; Q-Q plot and histogram of its studentized
# residuals
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)
qqnorm(standardized)
abline(0, 1)
hist(standardized)
qqnorm(standardized)
abline(0, 1)
```
```{r}
# Q-Q plot of the studentized residuals of longitude against ocean_proximity
ModeloLineal <- lm(longitude ~ ocean_proximity, data = SinAtipicos)
standardized <- rstudent(ModeloLineal)
qqnorm(standardized)
abline(0, 1)
```
Se rechaza el supuesto de linealidad
```{r}
library(stats)
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)
# Fligner-Killeen tests of homogeneity of variances across ocean_proximity
result <- fligner.test(longitude ~ ocean_proximity, data = datos1)
result
result <- fligner.test(latitude ~ ocean_proximity, data = datos1)
result
```
```{r}
# Write the imputed data set to disk.
# The destination is passed as a full path instead of calling setwd(), and the
# folder name is kept on one line: the original string literal contained a
# line break between "Proyecto" and "final".
salida <- file.path(
  "C:/Users/Usuario1/Desktop/Maestría/Modulo1/Proyecto final/Datos_corregidos",
  "datos_corregidos.csv"
)
# The separator was "i", which cannot delimit a CSV file. A comma matches the
# read.csv() default used to load the input.
# NOTE(review): if ";" was intended, change sep accordingly.
write.table(datos1, file = salida, sep = ",", row.names = FALSE)
```