Está en la página 1 de 40

TRATAMIENTO DE DATOS EN R Studio

Por: Mario Orlando Suarez Ibujes


Fecha: 25/10/2023
https://orcid.org/0000-0002-3962-5433
https://scholar.google.com/citations?user=FUoyU1cAAAAJ&hl=e
http://repositorio.utn.edu.ec/handle/123456789/760

Table of Contents
Procesamiento y correccion de strings ................................................................................................ 2
Localizacion de errores .............................................................................................................................. 5
Imputacion ...................................................................................................................................................... 7
Supuestos ......................................................................................................................................................... 9
Codigo RMarkdown................................................................................................................................... 27
Procesamiento y corrección de strings
# Load the raw housing data set and inspect its structure.
library(readr)
archivo <- "datosHousing.csv"
datos <- read.csv(archivo)
str(datos)

## 'data.frame': 20640 obs. of 11 variables:


## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ longitude : num -122 -122 -122 -122 -122 ...
## $ latitude : num 37.9 37.9 37.9 37.9 37.9 ...
## $ housing_median_age: int 41 21 52 52 52 52 52 NA 42 52 ...
## $ total_rooms : int 880 7099 1467 1274 1627 919 2535 NA 2555 3
549 ...
## $ total_bedrooms : int 129 1106 190 235 280 213 489 687 665 707 .
..
## $ population : int 322 NA 496 558 565 413 1094 1157 1206 1551
...
## $ households : int 126 1138 177 219 259 193 514 647 595 714 .
..
## $ median_income : num 8.33 8.3 7.26 5.64 3.85 ...
## $ median_house_value: num 452600 358500 352100 341300 342200 ... ##
$ ocean_proximity : chr "near bay" "near bay" "near bay" "near bay "
...

summary(datos)

## X longitude latitude housing_median_age


## Min. : 1 Min. :-124.3 Min. :32.54 Min. : 1.00
## 1st Qu.: 5161 1st Qu.:-121.8 1st Qu.:33.93 1st Qu.:18.00
## Median :10320 Median :-118.5 Median :34.25 Median :29.00
## Mean :10320 Mean :-119.6 Mean :35.63 Mean :28.64
## 3rd Qu.:15480 3rd Qu.:-118.0 3rd Qu.:37.72 3rd Qu.:37.00
## Max. :20640 Max. :-114.3 Max. :41.95 Max. :52.00
## NA's :1032 NA's :1032 NA's :1032 ##
total_rooms total_bedrooms population households
## Min. : 2 Min. : 1.0 Min. : 3 Min. : 1.0
## 1st Qu.: 1449 1st Qu.: 295.0 1st Qu.: 786 1st Qu.: 281.0
## Median : 2127 Median : 435.0 Median : 1166 Median : 410.0
## Mean : 2637 Mean : 536.9 Mean : 1427 Mean : 500.5
## 3rd Qu.: 3143 3rd Qu.: 646.0 3rd Qu.: 1726 3rd Qu.: 606.0
## Max. :39320 Max. :6445.0 Max. :35682 Max. :6082.0 ##
NA's :1032 NA's :1230 NA's :1032 NA's :1032
## median_income median_house_value ocean_proximity
## Min. : 0.4999 Min. : 14999 Length:20640
## 1st Qu.: 2.5634 1st Qu.:119800 Class :character
## Median : 3.5313 Median :179800 Mode :character
## Mean : 3.8713 Mean :206986
## 3rd Qu.: 4.7470 3rd Qu.:264700
## Max. :15.0001 Max. :500001 ##
NA's :1032 NA's :1032

El archivo se denomina datosHousing.csv, contiene 20640 registros y 11 variables. Estas variables son: X, longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, median_house_value, ocean_proximity.
table(datos$ocean_proximity)

##
## <1h oc <1h ocean <1H OCEAN in land inl and inland
INLAND
## 1561 671 6454 384 320 1310
4194
## ISLAND nea bay near bay NEAR BAY NEAR OCEAN ##
4 410 1429 340 2531

# Strip leading/trailing whitespace from the category labels.
kk <- trimws(datos$ocean_proximity)
table(kk)

## kk
## <1h oc <1h ocean <1H OCEAN in land inl and inland
INLAND
## 1561 671 6454 384 320 1310
4194
## ISLAND nea bay near bay NEAR BAY NEAR OCEAN ##
4 410 1429 340 2531

# Normalise letter case so that case-only variants collapse together.
kk <- str_to_title(kk)
table(kk)

## kk
## <1h Oc <1h Ocean In Land Inl And Inland Island N
ea Bay
## 1561 7125 384 320 5504 4
410
## Near Bay Near Ocean
## 1769 2531

#<1H OCEAN
#INLAND
#ISLAND
#NEAR BAY
#NEAR OCEAN

# Map each misspelled title-case variant to its canonical label.
# Order matters: "<1h Ocean" is replaced before its prefix "<1h Oc", so the
# shorter pattern can no longer produce a "<1H OCEANean" artefact.
reemplazos <- c(
  "<1h Ocean" = "<1H OCEAN",
  "<1h Oc" = "<1H OCEAN",
  "In Land" = "INLAND",
  "Inl And" = "INLAND",
  "Nea Bay" = "NEAR BAY"
)
for (patron in names(reemplazos)) {
  kk <- sub(patron, reemplazos[[patron]], kk, fixed = TRUE)
}
# Upper-casing finishes the labels that were already spelled correctly
# ("Inland", "Island", "Near Bay", "Near Ocean").
kk <- toupper(kk)
table(kk)

## kk
## <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
## 8686 6208 4 2179 2531

# Update

datos$ocean_proximity <-as.factor(kk)
Para comprobar los cambios se presenta la tabla de frecuencias.
table(datos$ocean_proximity)

##
## <1H OCEAN INLAND ISLAND NEAR BAY
NEAR OCEAN ## 8686 6208
4 2179 2531

library(ggplot2)
ggplot(datos, aes(x = (ocean_proximity))) +
geom_bar(stat = "count", color = "red", fill
= "light blue")
##
Localizacion de errores
# Estadísticas de valores
vacíos sum(is.na(datos)) ## [1]
10518 colSums(is.na(datos))
## X longitude latitude housing_media
n_age
## 0 1032 1032
1032
## total_rooms total_bedrooms population house
holds
## 1032 1230 1032
1032
## median_income median_house_value ocean_proximity ##
1032 1032 1032 colSums(datos=="")
## X longitude latitude housing_media
n_age
## 0 NA NA
NA
## total_rooms total_bedrooms population house
holds
## NA NA NA
NA
## median_income median_house_value
ocean_proximity ## NA
NA NA

longitude
# Rows whose longitude is missing. The column is numeric, so a missing value
# is NA (never ""); comparing with "" yields NA and indexes all-NA rows
# instead of the real records.
longitude <- datos[is.na(datos$longitude), ]
dim(longitude)
## [1] 1032 11
boxplot(datos$longitude)

Corrección de errores
boxplot.stats(datos$longitude)$out

## numeric(0)
boxplot.stats(datos$latitude)$out
## numeric(0)
boxplot.stats(datos$housing_median_age)$out

## integer(0)
boxplot.stats(datos$total_rooms)$out
length(boxplot.stats(datos$total_rooms)$out)
## [1] 1233 boxplot.stats(datos$total_bedrooms)$out

length(boxplot.stats(datos$total_bedrooms)$out)

## [1] 1208 boxplot.stats(datos$population)$out


length(boxplot.stats(datos$population)$out)
## [1] 1136 boxplot.stats(datos$households)$out
length(boxplot.stats(datos$households)$out)
## [1] 1160 boxplot.stats(datos$median_income)$out
length(boxplot.stats(datos$median_income)$out)
## [1] 643 boxplot.stats(datos$median_house_value)$out
length(boxplot.stats(datos$median_house_value)$out)
## [1] 1026

# Mahalanobis distance of every record over columns 2 to 10 (the numeric
# variables), using NA-tolerant column means and pairwise covariances.
datosN <- datos[, 2:10]
Mahal <- mahalanobis(
  datosN,
  colMeans(datosN, na.rm = TRUE),
  cov(datosN, use = "pairwise.complete.obs")
)
Mahal
Imputación
library(VIM)
# k-nearest-neighbour imputation of the missing values; the printed result
# also carries one logical <variable>_imp indicator column per variable.
datos1 <- kNN(SinAtipicos)
datos1

FALSE
## total_bedrooms_imp population_imp households_imp median_income_im
p
## 1 FALSE FALSE FALSE FALS
E
## 2 FALSE TRUE FALSE ##
[ reached 'max' / getOption("max.print") -- omitted 15603 rows ]
sum(is.na(datosN))
## [1] 9486
sum(is.na(datos1))
## [1] 0

par(mfrow = c(3, 3))


hist(datos1$longitude, breaks = 20, main = "longitude", border="darkorang
e", col="dodgerblue")
hist(datos1$latitude, breaks = 20, main = "latitude", border="darkorange"
, col="dodgerblue")
hist(datos1$housing_median_age, breaks = 20, main = "housing_median_age",
border="darkorange", col="dodgerblue")
hist(datos1$total_rooms, breaks = 20, main = "total_rooms", border="darko
range", col="dodgerblue")
hist(datos1$total_bedrooms, breaks = 20, main = "total_bedrooms", border=
"darkorange", col="dodgerblue")
hist(datos1$population, breaks = 20, main = "population", border="darkora
nge", col="dodgerblue")
hist(datos1$households, breaks = 20, main = "households", border="darkora
nge", col="dodgerblue")
hist(datos1$median_income, breaks = 20, main = "median_income", border="d
arkorange", col="dodgerblue")
hist(datos1$median_house_value, breaks = 20, main = "median_house_value",
border="darkorange", col="dodgerblue")
Supuestos
### Supuestos
# "Fake" regression: regress random chi-squared noise (7 df) on the numeric
# columns and inspect the studentized residuals with a Q-Q plot.
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)
qqnorm(standardized)
abline(0, 1)

hist(datos1$longitude)
ks.test(datos1$longitude,"pnorm",mean = mean
(datos1$longitude ),sd=sd(datos1$longitude ))
## Warning in ks.test(datos1$longitude, "pnorm", mean = mean(datos1$longi
tude), :
## ties should not be present for the Kolmogorov-Smirnov test

##
## One-sample Kolmogorov-Smirnov test
##
## data: datos1$longitude
## D = 0.20615, p-value < 2.2e-16 ##
alternative hypothesis: two-sided
hist(datos1$housing_median_age)
ks.test(datos1$housing_median_age ,"pnorm",mean=mean
(datos1$housing_median_age ),sd=sd(datos1$housing_median_age ))

## Warning in ks.test(datos1$housing_median_age, "pnorm", mean =


## mean(datos1$housing_median_age), : ties should not be present for the
## Kolmogorov-Smirnov test

##
## One-sample Kolmogorov-Smirnov test ##
## data: datos1$housing_median_age
## D = 0.060003, p-value < 2.2e-16 ##
alternative hypothesis: two-sided
hist(datos1$total_rooms)
ks.test(datos1$total_rooms ,"pnorm",mean=mean
(datos1$total_rooms ),sd=sd(datos1$total_rooms ))

## Warning in ks.test(datos1$total_rooms, "pnorm", mean =


## mean(datos1$total_rooms), : ties should not be present for the Kolmogo
rov-
## Smirnov test

##
## One-sample Kolmogorov-Smirnov test
##
## data: datos1$total_rooms
## D = 0.1344, p-value < 2.2e-16 ##
alternative hypothesis: two-sided
hist(datos1$total_bedrooms)
ks.test(datos1$total_bedrooms ,"pnorm",mean=mean
(datos1$total_bedrooms ),sd=sd(datos1$total_bedrooms ))

## Warning in ks.test(datos1$total_bedrooms, "pnorm", mean =


## mean(datos1$total_bedrooms), : ties should not be present for the Kolm
ogorov-
## Smirnov test

##
## One-sample Kolmogorov-Smirnov test
##
## data: datos1$total_bedrooms
## D = 0.12855, p-value < 2.2e-16 ##
alternative hypothesis: two-sided
hist(datos1$population)
ks.test(datos1$population ,"pnorm",mean=mean
(datos1$population ),sd=sd(datos1$population ))
## Warning in ks.test(datos1$population, "pnorm", mean =
mean(datos1$popu lation), :
## ties should not be present for the Kolmogorov-Smirnov test

##
## One-sample Kolmogorov-Smirnov test
##
## data: datos1$population
## D = 0.12413, p-value < 2.2e-16 ##
alternative hypothesis: two-sided
hist(datos1$households)
ks.test(datos1$households ,"pnorm",mean=mean
(datos1$households ),sd=sd(datos1$households ))
## Warning in ks.test(datos1$households, "pnorm", mean =
mean(datos1$hous eholds), :
## ties should not be present for the Kolmogorov-Smirnov test

##
## One-sample Kolmogorov-Smirnov test
##
## data: datos1$households
## D = 0.1244, p-value < 2.2e-16 ##
alternative hypothesis: two-sided
hist(datos1$median_income)
ks.test(datos1$median_income ,"pnorm",mean=mean
(datos1$median_income ),sd=sd(datos1$median_income ))

## Warning in ks.test(datos1$median_income, "pnorm", mean =


## mean(datos1$median_income), : ties should not be present for the Kolmo
gorov-
## Smirnov test

##
## One-sample Kolmogorov-Smirnov test ##
## data: datos1$median_income
## D = 0.077547, p-value < 2.2e-16 ##
alternative hypothesis: two-sided
hist(datos1$median_house_value)
ks.test(datos1$median_house_value ,"pnorm",mean=mean
(datos1$median_house_value ),sd=sd(datos1$median_house_value ))

## Warning in ks.test(datos1$median_house_value, "pnorm", mean =


## mean(datos1$median_house_value), : ties should not be present for the
## Kolmogorov-Smirnov test

##
## One-sample Kolmogorov-Smirnov test ##
## data: datos1$median_house_value
## D = 0.10172, p-value < 2.2e-16 ##
alternative hypothesis: two-sided

# Refit the "fake" regression on fresh random noise and inspect the
# studentized residuals.
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)

qqnorm(standardized)
abline(0, 1)
hist(standardized)

# NOTE(review): this repeats the Q-Q plot drawn above on the same residuals;
# consider removing it.
qqnorm(standardized)
abline(0, 1)
# For each numeric variable, fit variable ~ ocean_proximity on the
# outlier-free data and draw a Q-Q plot of the studentized residuals.
variables <- c(
  "longitude", "latitude", "housing_median_age", "total_rooms",
  "total_bedrooms", "population", "households", "median_income",
  "median_house_value"
)
for (v in variables) {
  formula_v <- reformulate("ocean_proximity", response = v)
  ModeloLineal <- lm(formula_v, data = SinAtipicos)
  standardized <- rstudent(ModeloLineal)
  # The variable name is added to the title so the nine plots can be told apart.
  qqnorm(standardized, main = paste("Normal Q-Q Plot:", v))
  abline(0, 1)
}

Se rechaza el supuesto de linealidad.
# Scatterplot of standardized fitted values against studentized residuals of
# the "fake" regression, with reference lines through the origin.
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)
fitted <- scale(fake$fitted.values)
plot(fitted, standardized, main = "Scatterplot")
abline(0, 0)
abline(v = 0)
library(stats)
result = fligner.test(longitude ~ ocean_proximity, data = datos1) result

##
## Fligner-Killeen test of homogeneity of variances ##
## data: longitude by ocean_proximity
## Fligner-Killeen:med chi-squared = 4252.1, df = 4, p-value < 2.2e-16

result = fligner.test(latitude ~ ocean_proximity, data = datos1) result

##
## Fligner-Killeen test of homogeneity of variances ##
## data: latitude by ocean_proximity
## Fligner-Killeen:med chi-squared = 4406.2, df = 4, p-value < 2.2e-16
result = fligner.test(housing_median_age ~ ocean_proximity, data = datos1
)
result

##
## Fligner-Killeen test of homogeneity of variances ##
## data: housing_median_age by ocean_proximity
## Fligner-Killeen:med chi-squared = 66.461, df = 4, p-value = 1.266e-13
result = fligner.test(total_rooms ~ ocean_proximity, data = datos1)
result

##
## Fligner-Killeen test of homogeneity of variances ##
## data: total_rooms by ocean_proximity
## Fligner-Killeen:med chi-squared = 61.457, df = 4, p-value = 1.433e-12

result = fligner.test(total_bedrooms ~ ocean_proximity, data = datos1)


result

##
## Fligner-Killeen test of homogeneity of variances ##
## data: total_bedrooms by ocean_proximity
## Fligner-Killeen:med chi-squared = 14.858, df = 4, p-value = 0.005005

result = fligner.test(population ~ ocean_proximity, data = datos1) result

##
## Fligner-Killeen test of homogeneity of variances ##
## data: population by ocean_proximity
## Fligner-Killeen:med chi-squared = 107.83, df = 4, p-value < 2.2e-16

result = fligner.test(households ~ ocean_proximity, data = datos1) result

##
## Fligner-Killeen test of homogeneity of variances ##
## data: households by ocean_proximity
## Fligner-Killeen:med chi-squared = 8.904, df = 4, p-value = 0.06355

result = fligner.test(median_income ~ ocean_proximity, data = datos1)


result

##
## Fligner-Killeen test of homogeneity of variances ##
## data: median_income by ocean_proximity
## Fligner-Killeen:med chi-squared = 402.7, df = 4, p-value < 2.2e-16
result = fligner.test(median_house_value ~ ocean_proximity, data = datos1
)
result

##
## Fligner-Killeen test of homogeneity of variances
##
## data: median_house_value by ocean_proximity
## Fligner-Killeen:med chi-squared = 1845, df = 4, p-value < 2.2e-16

Código RMarkdown

```{r setup, include=FALSE}
# Show the source code of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)
```

```{r load_libraries, include=FALSE}
# Libraries (each package attached once, in order of first appearance).
library(knitr)
library(lubridate)
library(VIM)
library(stringr)
library(psych)
library(seqinr)
library(dplyr)
library(Rcpp)
library(naniar)
library(visdat)
library(tidyverse)
library(simputation)
library(rstudioapi)
library(readr)
library(kableExtra)
library(ggplot2)
library(caret)
library(leaps)
library(car)
library(mice)
library(scales)
library(RColorBrewer)
library(plotly)
library(nortest)
library(lmtest)
```

## Procesamiento y correccion de strings

```{r}
# Load the raw housing data set and inspect its structure.
library(readr)
archivo <- "datosHousing.csv"
datos <- read.csv(archivo)
str(datos)
```

```{r}
# Per-column summary statistics, including NA counts.
summary(datos)
```

```{r chunck1.1, eval=FALSE, echo=FALSE}
# Read data: list the column names (this chunk is neither run nor shown).
kable(names(datos))
toString(names(datos))
```

El archivo se denomina *`r archivo`*, contiene `r nrow(datos)` registros y `r ncol(datos)` variables. Estas variables son: `r toString(names(datos))`.

```{r}
# Raw frequency table of the category labels.
table(datos$ocean_proximity)
```

```{r}
# Strip leading/trailing whitespace from the category labels.
kk <- trimws(datos$ocean_proximity)
table(kk)
```

```{r}
# Normalise letter case so that case-only variants collapse together.
kk <- str_to_title(kk)
table(kk)
```
```{r}
# Canonical category labels:
# <1H OCEAN
# INLAND
# ISLAND
# NEAR BAY
# NEAR OCEAN

# Map each misspelled title-case variant to its canonical label.
# Order matters: "<1h Ocean" is replaced before its prefix "<1h Oc", so the
# shorter pattern can no longer produce a "<1H OCEANean" artefact.
reemplazos <- c(
  "<1h Ocean" = "<1H OCEAN",
  "<1h Oc" = "<1H OCEAN",
  "In Land" = "INLAND",
  "Inl And" = "INLAND",
  "Nea Bay" = "NEAR BAY"
)
for (patron in names(reemplazos)) {
  kk <- sub(patron, reemplazos[[patron]], kk, fixed = TRUE)
}
# Upper-casing finishes the labels that were already spelled correctly
# ("Inland", "Island", "Near Bay", "Near Ocean").
kk <- toupper(kk)
table(kk)

# Update the data frame with the cleaned labels as a factor.
datos$ocean_proximity <- as.factor(kk)
```
Para comprobar los cambios se presenta la tabla de frecuencias.

```{r}
# Frequency table of the column as it now stands, to check the recoding.
table(datos$ocean_proximity)
```

```{r}
library(ggplot2)
# Bar chart of the number of records per ocean_proximity category.
ggplot(datos, aes(x = ocean_proximity)) +
  geom_bar(stat = "count", color = "red", fill = "light blue")
```
## Localizacion de errores

```{r}
# Missing-value statistics: overall NA total, then NA count per column.
sum(is.na(datos))
colSums(is.na(datos))
# Empty-string count per column. na.rm = TRUE is required: without it every
# column that contains an NA reports NA instead of a count.
colSums(datos == "", na.rm = TRUE)
```
### longitude
```{r}
# Rows whose longitude is missing. The column is numeric, so a missing value
# is NA (never ""); comparing with "" yields NA and indexes all-NA rows
# instead of the real records.
longitude <- datos[is.na(datos$longitude), ]
dim(longitude)
```

```{r}
boxplot(datos$longitude)
```

## Correccion de errores

```{r}
# boxplot.stats()$out lists the values lying beyond the boxplot whiskers.
boxplot.stats(datos$longitude)$out
```

```{r}
boxplot.stats(datos$latitude)$out
```

```{r}
boxplot.stats(datos$housing_median_age)$out
```

```{r}
# From here on the number of flagged values is reported as well.
boxplot.stats(datos$total_rooms)$out
length(boxplot.stats(datos$total_rooms)$out)
```

```{r}
boxplot.stats(datos$total_bedrooms)$out
length(boxplot.stats(datos$total_bedrooms)$out)
```

```{r}
boxplot.stats(datos$population)$out
length(boxplot.stats(datos$population)$out)
```

```{r}
boxplot.stats(datos$households)$out
length(boxplot.stats(datos$households)$out)
```

```{r}
boxplot.stats(datos$median_income)$out
length(boxplot.stats(datos$median_income)$out)
```

```{r}
boxplot.stats(datos$median_house_value)$out
length(boxplot.stats(datos$median_house_value)$out)
```

```{r}
# Mahalanobis distance of every record over columns 2 to 10 (the numeric
# variables), using NA-tolerant column means and pairwise covariances.
# The original computed and printed this twice; once is enough.
datosN <- datos[, 2:10]
Mahal <- mahalanobis(
  datosN,
  colMeans(datosN, na.rm = TRUE),
  cov(datosN, use = "pairwise.complete.obs")
)
Mahal
```
```{r}
# Cut-off score: the 0.999 quantile of the chi-squared distribution.
# NOTE(review): the degrees of freedom are ncol(datosN) - 1; the usual
# reference distribution for a Mahalanobis distance uses the number of
# variables, ncol(datosN). Confirm that the "- 1" is intended.
PuntajeCorte <- qchisq(1 - 0.001, ncol(datosN) - 1)
PuntajeCorte
```

```{r}
# Count of records below the cut-off (TRUE), at or above it (FALSE), and
# records whose distance is NA.
summary(Mahal < PuntajeCorte)
```

```{r}
# Keep the records below the cut-off. Records whose distance is NA are kept
# too, so only scored outliers are dropped.
SinAtipicos <- subset(datos, Mahal < PuntajeCorte | is.na(Mahal))
SinAtipicos
```

## Imputacion

```{r}
library(VIM)
# k-nearest-neighbour imputation of the missing values in the outlier-free
# data set.
datos1 <- kNN(SinAtipicos)
datos1
```

```{r}
# NA count in the numeric columns of the raw data versus the imputed result.
sum(is.na(datosN))
sum(is.na(datos1))
```
```{r}
# Histograms of the nine numeric variables after imputation, on a 3 x 3 grid.
par(mfrow = c(3, 3))
variables <- c(
  "longitude", "latitude", "housing_median_age", "total_rooms",
  "total_bedrooms", "population", "households", "median_income",
  "median_house_value"
)
for (v in variables) {
  hist(
    datos1[[v]],
    breaks = 20,
    main = v,
    xlab = paste0("datos1$", v), # same axis label as the explicit calls gave
    border = "darkorange",
    col = "dodgerblue"
  )
}
```
### Supuestos
```{r}
# "Fake" regression: regress random chi-squared noise (7 df) on the numeric
# columns and inspect the studentized residuals with a Q-Q plot.
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)
qqnorm(standardized)
abline(0, 1)
```

```{r}
# Normality check per variable: a histogram followed by a one-sample
# Kolmogorov-Smirnov test against a normal distribution whose mean and sd
# are estimated from the same column.
# Each ks.test() call is wrapped over two lines; the line break falls inside
# the call's parentheses, so `mean` and the following `(...)` form one call.
# NOTE(review): latitude is not tested here although it is one of the
# columns in datos1[, 2:10] - confirm whether that omission is intentional.
# NOTE(review): the reference parameters are estimated from the data being
# tested, so the reported p-values are only approximate - confirm whether a
# Lilliefors-type test (nortest is loaded by this document) was intended.
hist(datos1$longitude)
ks.test(datos1$longitude,"pnorm",mean = mean
(datos1$longitude ),sd=sd(datos1$longitude ))
hist(datos1$housing_median_age)
ks.test(datos1$housing_median_age ,"pnorm",mean=mean
(datos1$housing_median_age ),sd=sd(datos1$housing_median_age ))

hist(datos1$total_rooms)
ks.test(datos1$total_rooms ,"pnorm",mean=mean
(datos1$total_rooms ),sd=sd(datos1$total_rooms ))

hist(datos1$total_bedrooms)
ks.test(datos1$total_bedrooms ,"pnorm",mean=mean
(datos1$total_bedrooms ),sd=sd(datos1$total_bedrooms ))

hist(datos1$population)
ks.test(datos1$population ,"pnorm",mean=mean
(datos1$population ),sd=sd(datos1$population ))

hist(datos1$households)
ks.test(datos1$households ,"pnorm",mean=mean
(datos1$households ),sd=sd(datos1$households ))

hist(datos1$median_income)
ks.test(datos1$median_income ,"pnorm",mean=mean
(datos1$median_income ),sd=sd(datos1$median_income ))

hist(datos1$median_house_value)
ks.test(datos1$median_house_value ,"pnorm",mean=mean
(datos1$median_house_value ),sd=sd(datos1$median_house_value ))
```
```{r}
# Refit the "fake" regression on fresh random noise and inspect the
# studentized residuals.
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)

qqnorm(standardized)
abline(0, 1)

hist(standardized)

# NOTE(review): this repeats the Q-Q plot drawn above on the same residuals;
# consider removing it.
qqnorm(standardized)
abline(0, 1)
```

```{r}
# For each numeric variable, fit variable ~ ocean_proximity on the
# outlier-free data and draw a Q-Q plot of the studentized residuals.
variables <- c(
  "longitude", "latitude", "housing_median_age", "total_rooms",
  "total_bedrooms", "population", "households", "median_income",
  "median_house_value"
)
for (v in variables) {
  formula_v <- reformulate("ocean_proximity", response = v)
  ModeloLineal <- lm(formula_v, data = SinAtipicos)
  standardized <- rstudent(ModeloLineal)
  # The variable name is added to the title so the nine plots can be told apart.
  qqnorm(standardized, main = paste("Normal Q-Q Plot:", v))
  abline(0, 1)
}
```
Se rechaza el supuesto de linealidad

```{r}
# Scatterplot of standardized fitted values against studentized residuals of
# the "fake" regression, with reference lines through the origin.
numerica <- datos1[, 2:10]
random <- rchisq(nrow(numerica), 7)
fake <- lm(random ~ ., data = numerica)
standardized <- rstudent(fake)
fitted <- scale(fake$fitted.values)
plot(fitted, standardized, main = "Scatterplot")
abline(0, 0)
abline(v = 0)

# Fligner-Killeen test of homogeneity of variances of each numeric variable
# across the ocean_proximity groups.
library(stats)
variables <- c(
  "longitude", "latitude", "housing_median_age", "total_rooms",
  "total_bedrooms", "population", "households", "median_income",
  "median_house_value"
)
for (v in variables) {
  formula_v <- reformulate("ocean_proximity", response = v)
  result <- fligner.test(formula_v, data = datos1)
  # Results are not auto-printed inside a loop, so print explicitly.
  print(result)
}
```

```{r}
# Export the cleaned and imputed data set.
# The target file is addressed by its full path instead of calling setwd(),
# so the working directory of the document is left untouched.
# NOTE(review): the directory is machine-specific; adjust before knitting
# on another computer.
directorio_salida <- file.path(
  "C:/Users/Usuario1/Desktop/Maestría/Modulo1",
  "Proyecto final",
  "Datos_corregidos"
)
# NOTE(review): the separator was the letter "i", presumably a slip for ";".
# Confirm the intended delimiter (use "," if the file must be readable with
# a plain read.csv() call).
write.table(
  datos1,
  file = file.path(directorio_salida, "datos_corregidos.csv"),
  sep = ";",
  row.names = FALSE
)
```

También podría gustarte