
RANDOM FOREST MODEL DEVELOPMENT

We set up the Spark environment


In [1]: # Use findspark to locate the local Spark installation
import findspark
findspark.init()

# Create the Spark application session
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

configuracion = SparkConf().setAppName('Clasificacion usando Random Forest').setMaster('local')

sc = SparkContext(conf=configuracion)
spark = SparkSession(sc)
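On Spark 2.x and later the same session is usually obtained through the builder API, which manages the SparkContext for you. A minimal equivalent sketch (not part of the original run):

# Builder-style session setup; getOrCreate() reuses a running session
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('Clasificacion usando Random Forest') \
    .master('local') \
    .getOrCreate()
sc = spark.sparkContext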

We prepare the algorithm and import the original data


In [2]: # Imports
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Import the data
dataset = spark.read.csv(path="diabetes2.csv", header=True, inferSchema=True)
dataset.printSchema()

root
|-- PatientID: integer (nullable = true)
|-- Pregnancies: integer (nullable = true)
|-- PlasmaGlucose: integer (nullable = true)
|-- DiastolicBloodPressure: integer (nullable = true)
|-- TricepsThickness: integer (nullable = true)
|-- SerumInsulin: integer (nullable = true)
|-- BMI: double (nullable = true)
|-- DiabetesPedigree: double (nullable = true)
|-- Age: integer (nullable = true)
|-- Diabetic: integer (nullable = true)
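Before training it is worth checking the balance of the target column; a quick sanity check along these lines (hypothetical, not shown in the original run) would be:

# Hypothetical check: distribution of the Diabetic label
dataset.groupBy('Diabetic').count().show()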

We split the data into features and labels


In [3]: # Initialize the vector assembler
assembler = VectorAssembler(
    inputCols=['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness',
               'SerumInsulin','BMI','DiabetesPedigree','Age'],
    outputCol="features")

output = assembler.transform(dataset)

datain = output.select("features", "Diabetic")
datain = datain.withColumnRenamed('Diabetic', 'label')

# Create a column of standardized features (withStd=True, withMean=False)
from pyspark.ml.feature import StandardScaler

dbscaler = StandardScaler(inputCol="features", outputCol="featuresNTF",
                          withStd=True, withMean=False)

db_scalerModel = dbscaler.fit(datain)
db_scaledData = db_scalerModel.transform(datain)

# Split the data: 80% training, 20% test (fixed seed for reproducibility)
db_scaledData = db_scaledData.cache()
splits = db_scaledData.randomSplit([0.8, 0.2], 1771)
df_training = splits[0].cache()
df_test = splits[1].cache()

display(df_training)

DataFrame[features: vector, label: int, featuresNTF: vector]
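With withMean=False the scaler only divides each feature by its standard deviation, so featuresNTF is a per-column rescaling of features. The fitted factors can be inspected if needed; a sketch (not shown in the original run):

# Per-feature standard deviations learned by the fitted scaler
print(db_scalerModel.std)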

We implement the Random Forest algorithm


In [5]: def trainModel(feature):
    # Initialize the RandomForestClassifier on the given feature column
    rf = RandomForestClassifier(labelCol="label", featuresCol=feature, numTrees=15)

    # Fit on the training split and score the test split
    model = rf.fit(df_training)
    predictions = model.transform(df_test)

    result = predictions.select('label', feature, 'rawPrediction', 'probability', 'prediction')
    return result
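Note that the Pipeline imported in In [2] is never used; the assembly, scaling, and training steps could equally be chained into one. A minimal sketch, assuming raw_train and raw_test are splits of the original dataset (hypothetical names, not part of the original run):

# Hypothetical: the same stages chained as a single Pipeline
rf_stage = RandomForestClassifier(labelCol="Diabetic", featuresCol="featuresNTF", numTrees=15)
pipeline = Pipeline(stages=[assembler, dbscaler, rf_stage])

raw_train, raw_test = dataset.randomSplit([0.8, 0.2], 1771)  # hypothetical split
pipelineModel = pipeline.fit(raw_train)
predictions = pipelineModel.transform(raw_test)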

Unnormalized implementation


In [6]: resultadoSinNormalizar = trainModel('features')
resultadoSinNormalizar.show()

+-----+--------------------+--------------------+--------------------+----------+
|label| features| rawPrediction| probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
| 0|[0.0,44.0,44.0,26...|[14.7792521964130...|[0.98528347976086...| 0.0|
| 0|[0.0,44.0,100.0,4...|[14.8296103225928...|[0.98864068817285...| 0.0|
| 0|[0.0,46.0,61.0,46...|[10.7553964035555...|[0.71702642690370...| 0.0|
| 0|[0.0,49.0,60.0,15...|[14.8269154172880...|[0.98846102781920...| 0.0|
| 0|[0.0,51.0,60.0,40...|[14.8561258999000...|[0.99040839332667...| 0.0|
| 0|[0.0,51.0,88.0,41...|[14.4552300851177...|[0.96368200567451...| 0.0|
| 0|[0.0,52.0,86.0,35...|[14.4242122322541...|[0.96161414881694...| 0.0|
| 0|[0.0,53.0,47.0,11...|[13.6735199251111...|[0.91156799500741...| 0.0|
| 0|[0.0,53.0,97.0,27...|[14.4552300851177...|[0.96368200567451...| 0.0|
| 0|[0.0,54.0,78.0,10...|[14.7177971976147...|[0.98118647984098...| 0.0|
| 0|[0.0,55.0,54.0,17...|[14.6109883991019...|[0.97406589327346...| 0.0|
| 0|[0.0,55.0,57.0,46...|[14.4898788421022...|[0.96599192280681...| 0.0|
| 0|[0.0,55.0,62.0,11...|[14.4817456624249...|[0.96544971082833...| 0.0|
| 1|[0.0,55.0,80.0,15...|[10.6967129010831...|[0.71311419340554...| 0.0|
| 0|[0.0,55.0,86.0,10...|[14.6775913913405...|[0.97850609275603...| 0.0|
| 0|[0.0,57.0,89.0,32...|[14.4715966216522...|[0.96477310811014...| 0.0|
| 0|[0.0,57.0,97.0,56...|[13.4572219932048...|[0.89714813288032...| 0.0|
| 0|[0.0,58.0,45.0,21...|[14.5291300518230...|[0.96860867012153...| 0.0|
| 0|[0.0,58.0,50.0,7....|[14.7792521964130...|[0.98528347976086...| 0.0|
| 0|[0.0,58.0,67.0,25...|[14.4715966216522...|[0.96477310811014...| 0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 20 rows

Normalized implementation

In [7]: resultadoNormalizado = trainModel('featuresNTF')
resultadoNormalizado.show()

+-----+--------------------+--------------------+--------------------+----------+
|label| featuresNTF| rawPrediction| probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
| 0|[0.0,1.3704197436...|[14.7792521964130...|[0.98528347976086...| 0.0|
| 0|[0.0,1.3704197436...|[14.8296103225928...|[0.98864068817285...| 0.0|
| 0|[0.0,1.4327115501...|[10.7553964035555...|[0.71702642690370...| 0.0|
| 0|[0.0,1.5261492599...|[14.8269154172880...|[0.98846102781920...| 0.0|
| 0|[0.0,1.5884410664...|[14.8561258999000...|[0.99040839332667...| 0.0|
| 0|[0.0,1.5884410664...|[14.4552300851177...|[0.96368200567451...| 0.0|
| 0|[0.0,1.6195869697...|[14.4242122322541...|[0.96161414881694...| 0.0|
| 0|[0.0,1.6507328730...|[13.6735199251111...|[0.91156799500741...| 0.0|
| 0|[0.0,1.6507328730...|[14.4552300851177...|[0.96368200567451...| 0.0|
| 0|[0.0,1.6818787762...|[14.7177971976147...|[0.98118647984098...| 0.0|
| 0|[0.0,1.7130246795...|[14.6109883991019...|[0.97406589327346...| 0.0|
| 0|[0.0,1.7130246795...|[14.4898788421022...|[0.96599192280681...| 0.0|
| 0|[0.0,1.7130246795...|[14.4817456624249...|[0.96544971082833...| 0.0|
| 1|[0.0,1.7130246795...|[10.6967129010831...|[0.71311419340554...| 0.0|
| 0|[0.0,1.7130246795...|[14.6775913913405...|[0.97850609275603...| 0.0|
| 0|[0.0,1.7753164860...|[14.4715966216522...|[0.96477310811014...| 0.0|
| 0|[0.0,1.7753164860...|[13.4572219932048...|[0.89714813288032...| 0.0|
| 0|[0.0,1.8064623893...|[14.5291300518230...|[0.96860867012153...| 0.0|
| 0|[0.0,1.8064623893...|[14.7792521964130...|[0.98528347976086...| 0.0|
| 0|[0.0,1.8064623893...|[14.4715966216522...|[0.96477310811014...| 0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 20 rows

We evaluate the models


In [8]: # Function to evaluate a model using the confusion matrix
def evaluateModel(resultado):
    true_positive = resultado[(resultado['label'] == 1) & (resultado['prediction'] == 1.0)]
    true_negative = resultado[(resultado['label'] == 0) & (resultado['prediction'] == 0.0)]
    false_positive = resultado[(resultado['label'] == 0) & (resultado['prediction'] == 1.0)]
    false_negative = resultado[(resultado['label'] == 1) & (resultado['prediction'] == 0.0)]

    tot = resultado.count()
    tp = true_positive.count()
    tn = true_negative.count()
    fp = false_positive.count()
    fn = false_negative.count()

    print(f'Total count: {tot}')
    print(f'True Positives (TP): {tp}')
    print(f'True Negatives (TN): {tn}')
    print(f'False Positives (FP): {fp}')
    print(f'False Negatives (FN): {fn}')

    exactitud = (tp + tn) / tot
    tasa_error = (fp + fn) / tot
    sensibilidad = tp / (tp + fn)   # sensitivity (recall) = TP / (TP + FN)
    especificidad = tn / (tn + fp)  # specificity = TN / (TN + FP)

    print(f'Accuracy: {exactitud:.4f}')
    print(f'Error rate: {tasa_error:.4f}')
    print(f'Sensitivity (recall): {sensibilidad:.4f}')
    print(f'Specificity: {especificidad:.4f}')
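For reference, the metrics computed from the confusion matrix are: accuracy = (TP + TN) / N, error rate = (FP + FN) / N, sensitivity (recall) = TP / (TP + FN), and specificity = TN / (TN + FP), where N is the total number of test rows.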

Evaluation of the unnormalized model

In [9]: evaluateModel(resultadoSinNormalizar)

Total count: 1025
True Positives (TP): 296
True Negatives (TN): 640
False Positives (FP): 48
False Negatives (FN): 41
Accuracy: 0.9132
Error rate: 0.0868
Sensitivity (recall): 0.8783
Specificity: 0.9302
Evaluation of the normalized model

In [10]: evaluateModel(resultadoNormalizado)

Total count: 1025
True Positives (TP): 296
True Negatives (TN): 640
False Positives (FP): 48
False Negatives (FN): 41
Accuracy: 0.9132
Error rate: 0.0868
Sensitivity (recall): 0.8783
Specificity: 0.9302
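Both runs yield identical confusion matrices. This is expected: decision trees split on thresholds, so rescaling every feature by a positive constant (which is all StandardScaler does with withMean=False) produces equivalent splits and identical predictions. As a cross-check, the MulticlassClassificationEvaluator imported in In [2] could compute accuracy directly; a minimal sketch (not part of the original run):

# Hypothetical cross-check with Spark's built-in evaluator
evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print(evaluator.evaluate(resultadoSinNormalizar))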

