Documentos de Académico
Documentos de Profesional
Documentos de Cultura
org.apache.spark.mllib.regression.LabeledPoint
org.apache.spark.mllib.regression.LinearRegressionModel
org.apache.spark.mllib.regression.LinearRegressionWithSGD
org.apache.spark.mllib.linalg.Vectors
2. Apache Kafka:
Run the following exercise in Apache Kafka. Explain what Kafka is, what
it is used for, its pros and cons, and its typical use cases.
Explain step by step how you ran the exercise, including the pain points
and challenges you faced while running it. Also explain the output and
give the business context: how would you use it in a real-life business,
what does the Truck Event exercise do, and why is Kafka used for this
purpose?
Refer to the following linked file for installation and other details:
https://www.dropbox.com/s/7qd6pr9xixaoaug/Installation_Exercises_NetappD4.pptx?dl=0
Make sure you have installed Kafka by following the steps in the above
link. Then you can run the code step by step to get the results.
2. SAS: What are the PROC step and the DATA step in SAS? Give examples of
some of the most used SAS PROCs and explain their use in a business
context.
Run the following SAS code for multiple linear regression to get
multicollinearity and influence statistics (from the SAS manual) in
the SAS virtual machine, and explain the results in detail in a
statistical context.
/* Limit listing output to 80 columns per line. */
options linesize=80;
/* Build data set FITNESS from the 31 in-line records below. Each record
   holds seven blank-separated numeric values, read with list input in the
   order given on the INPUT statement.
   NOTE(review): presumably oxy is the oxygen-uptake response for the
   regression described in the assignment text - confirm. No PROC REG step
   is visible in this part of the file. */
data fitness;
input age weight oxy runtime rstpulse runpulse maxpulse;
cards;
44 89.47 44.609 11.37 62 178 182
40 75.07 45.313 10.07 62 185 185
44 85.84 54.297 8.65 45 156 168
42 68.15 59.571 8.17 40 166 172
38 89.02 49.874 9.22 55 178 180
47 77.45 44.811 11.63 58 176 176
40 75.98 45.681 11.95 70 176 180
43 81.19 49.091 10.85 64 162 170
44 81.42 39.442 13.08 63 174 176
38 81.87 60.055 8.63 48 170 186
44 73.03 50.541 10.13 45 168 168
45 87.66 37.388 14.03 56 186 192
45 66.45 44.754 11.12 51 176 176
47 79.15 47.273 10.60 47 162 164
54 83.12 51.855 10.33 50 166 170
49 81.42 49.156 8.95 44 180 185
51 69.63 40.836 10.95 57 168 172
51 77.91 46.672 10.00 48 162 168
48 91.63 46.774 10.25 48 162 164
49 73.37 50.388 10.08 67 168 168
57 73.37 39.407 12.63 58 174 176
54 79.38 46.080 11.17 62 156 165
52 76.32 45.441 9.63 48 164 166
50 70.87 54.625 8.92 48 146 155
51 67.25 45.118 11.08 48 172 172
54 91.63 39.203 12.88 44 168 172
51 73.71 45.790 10.47 59 186 188
57 59.08 50.545 9.93 49 148 155
49 76.32 48.673 9.40 56 186 188
48 61.24 47.920 11.50 52 170 176
52 82.78 47.467 10.50 53 170 172
;
/* Title printed on output produced by the steps that follow. */
title 'SAS Fitness data';
/* file: mammalsteeth.sas
   Example of cluster analysis taken from Example 4 of the SAS
   documentation to PROC CLUSTER.

   Pipeline: read the teeth counts -> principal components (teeth2) ->
   average-linkage clustering (ttree) -> cut the tree into four clusters
   (ttree2) -> merge scores with cluster membership (teeth3) -> plot and
   print by cluster. */
options nocenter nodate pageno=1 linesize=132;
title h=1 j=l 'File: cluster.mammalsteeth.sas';
title2 h=1 j=l 'Cluster Analysis of Mammals'' teeth data';

/* The data lines are fixed-column: the mammal name occupies columns 1-16
   and the eight single-digit tooth counts occupy columns 21-28, so the
   padding inside each data line must be preserved exactly. */
data teeth;
   input mammal $ 1-16
         @21 (v1-v8) (1.);
   label v1='Top incisors'
         v2='Bottom incisors'
         v3='Top canines'
         v4='Bottom canines'
         v5='Top premolars'
         v6='Bottom premolars'
         v7='Top molars'
         v8='Bottom molars';
   cards;
BROWN BAT           23113333
MOLE                32103333
SILVER HAIR BAT     23112333
PIGMY BAT           23112233
HOUSE BAT           23111233
RED BAT             13112233
PIKA                21002233
RABBIT              21003233
BEAVER              11002133
GROUNDHOG           11002133
GRAY SQUIRREL       11001133
HOUSE MOUSE         11000033
PORCUPINE           11001133
WOLF                33114423
BEAR                33114423
RACCOON             33114432
MARTEN              33114412
WEASEL              33113312
WOLVERINE           33114412
BADGER              33113312
RIVER OTTER         33114312
SEA OTTER           32113312
JAGUAR              33113211
COUGAR              33113211
FUR SEAL            32114411
SEA LION            32114411
GREY SEAL           32113322
ELEPHANT SEAL       21114411
REINDEER            04103333
ELK                 04103333
DEER                04003333
MOOSE               04003333
;

/* principal components analysis of teeth
   here we score the principal components and
   output them to data set teeth2 */
proc princomp data=teeth out=teeth2;
   var v1-v8;
run;

/* average linkage cluster analysis
   a dendrogram (tree diagram) is also output */
proc cluster data=teeth2 method=average outtree=ttree
             ccc pseudo rsquare;
   var v1-v8;
   id mammal;
run;

/* PROC TREE prints the tree diagram
   here we also output a data set, called ttree2,
   that contains four clusters */
proc tree data=ttree out=ttree2 nclusters=4;
   id mammal;
run;

/* the next set of statements sort the data sets
   by variable mammal and then merge the tree data set
   (with the cluster scores) with the teeth data set
   (with the principal components) */
proc sort data=teeth2;
   by mammal;
run;
proc sort data=ttree2;
   by mammal;
run;
data teeth3;
   merge teeth2 ttree2;
   by mammal;
run;

/* stuff for plotting: one plotting symbol per cluster number */
symbol1 c=black f=, v='1';
symbol2 c=black f=, v='2';
symbol3 c=black f=, v='3';
symbol4 c=black f=, v='4';

/* No DATA= option is given on the steps below, so each one operates on
   the most recently created data set (teeth3). */
proc gplot;
   plot prin2*prin1=cluster;
run;
proc sort;
   by cluster;
run;
proc print;
   by cluster;
   var mammal prin1 prin2;
run;
# Association-rule mining on the Groceries transactions: mine rules with
# apriori(), prune rules that are supersets of other rules, then target
# rules involving "whole milk" and plot the result.

# arules provides Groceries, apriori(), inspect() and is.subset();
# arulesViz provides the plot() method for rules. Both are attached up
# front so every call below resolves.
library(arules)
library(arulesViz)

# Load the data set
data(Groceries)

# Create an item frequency plot for the top 20 items
itemFrequencyPlot(Groceries, topN = 20, type = "absolute")

# Get the rules
rules <- apriori(Groceries, parameter = list(supp = 0.001, conf = 0.8))

# Show the top 5 rules, but only 2 digits
options(digits = 2)
inspect(rules[1:5])

# Order the rules so the most confident ones come first
rules <- sort(rules, by = "confidence", decreasing = TRUE)

# Re-mine with at most 3 items per rule to get more concise rules
rules <- apriori(
  Groceries,
  parameter = list(supp = 0.001, conf = 0.8, maxlen = 3)
)

# Drop redundant rules: a rule is flagged when an earlier rule is a
# subset of it. Only the strict upper triangle is kept so a rule is never
# compared with itself or counted twice.
# NOTE(review): recent arules versions may return a sparse matrix from
# is.subset() by default; if the NA assignment fails, pass sparse = FALSE
# or use is.redundant() - confirm against the installed version.
subset.matrix <- is.subset(rules, rules)
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- NA
redundant <- colSums(subset.matrix, na.rm = TRUE) >= 1
rules.pruned <- rules[!redundant]
rules <- rules.pruned

# What are customers likely to buy before buying whole milk?
# The item label must be exactly "whole milk" (single space) to match.
rules <- apriori(
  data = Groceries,
  parameter = list(supp = 0.001, conf = 0.08),
  appearance = list(default = "lhs", rhs = "whole milk"),
  control = list(verbose = FALSE)
)
rules <- sort(rules, decreasing = TRUE, by = "confidence")
inspect(rules[1:5])

# What are customers likely to buy if they purchase whole milk?
rules <- apriori(
  data = Groceries,
  parameter = list(supp = 0.001, conf = 0.15, minlen = 2),
  appearance = list(default = "rhs", lhs = "whole milk"),
  control = list(verbose = FALSE)
)
rules <- sort(rules, decreasing = TRUE, by = "confidence")
inspect(rules[1:5])

# Visualise the final rule set as a graph
plot(rules, method = "graph", interactive = TRUE, shading = NA)
Bonus Question: