やる気の感じられないサンプルなのは、動作メモのつもりで作成しているため…
recommendation
# -*- coding: utf-8 -*-
"""Minimal PySpark ALS recommendation sample.

Fits an ALS model on a small in-memory rating table, prints the
predictions for those same rows, and prints the RMSE between the
"rating" and "prediction" columns.
"""
import pyspark
from pyspark.ml import recommendation
from pyspark.ml import evaluation


def main():
    """Train ALS on the sample ratings and print predictions and RMSE.

    A SparkContext is created for the run and is always stopped before
    returning, even if training or evaluation raises.
    """
    conf = pyspark.SparkConf()
    conf.setAppName("nanohain sample3")
    sc = pyspark.SparkContext(conf=conf)
    try:
        sc.setLogLevel("ERROR")
        scSQL = pyspark.sql.SQLContext(sc)

        # (user, item, rating) samples: three users rating the same three items.
        listData = [
            {"user": 1, "item": 1, "rating": 90.5},
            {"user": 1, "item": 2, "rating": 90.5},
            {"user": 1, "item": 3, "rating": 90.5},
            {"user": 2, "item": 1, "rating": 50.0},
            {"user": 2, "item": 2, "rating": 50.0},
            {"user": 2, "item": 3, "rating": 50.0},
            {"user": 3, "item": 1, "rating": 80.0},
            {"user": 3, "item": 2, "rating": 100.0},
            {"user": 3, "item": 3, "rating": 100.0},
        ]
        dfData = scSQL.createDataFrame(listData)

        # To split the data into training and evaluation sets, use:
        #   dfLearn, dfCheck = dfData.randomSplit([0.8, 0.2])

        mlAls = recommendation.ALS(
            maxIter=5,
            regParam=0.01,
            implicitPrefs=False,
            userCol="user",
            itemCol="item",
            ratingCol="rating",
        )
        model = mlAls.fit(dfData)

        # The model is applied to the very rows it was trained on.
        predictions = model.transform(dfData)
        for r in predictions.collect():
            # Single-argument print() emits the same text on Python 2 and 3.
            print(r)

        evaluator = evaluation.RegressionEvaluator(
            metricName="rmse",
            labelCol="rating",
            predictionCol="prediction",
        )
        rmse = evaluator.evaluate(predictions)
        # Two spaces after "=" reproduce the output of the original
        # Python 2 statement `print "rmse = ", rmse`.
        print("rmse =  %s" % (rmse,))
    finally:
        # Release the Spark resources even when a step above fails.
        sc.stop()


if __name__ == "__main__":
    main()
naive bayes
# -*- coding: utf-8 -*-
"""Minimal PySpark Naive Bayes classification sample.

Fits a multinomial Naive Bayes model on five labelled dense vectors,
prints the predictions for those same rows, and prints the RMSE between
the "label" and "prediction" columns.
"""
import pyspark
from pyspark.ml import classification
from pyspark.ml import evaluation
from pyspark.ml import linalg


def main():
    """Train Naive Bayes on the sample vectors and print predictions and RMSE.

    A SparkContext is created for the run and is always stopped before
    returning, even if training or evaluation raises.
    """
    conf = pyspark.SparkConf()
    conf.setAppName("nanohain sample4")
    sc = pyspark.SparkContext(conf=conf)
    try:
        sc.setLogLevel("ERROR")
        scSQL = pyspark.sql.SQLContext(sc)

        # Labels are consecutive integers starting from 0.
        listData = [
            {"label": 0, "features": linalg.Vectors.dense([1.0, 0.0, 0.0, 0.0, 0.0, 5.0])},
            {"label": 1, "features": linalg.Vectors.dense([1.0, 1.0, 0.0, 0.0, 0.0, 5.0])},
            {"label": 2, "features": linalg.Vectors.dense([1.0, 0.0, 1.0, 0.0, 0.0, 5.0])},
            {"label": 3, "features": linalg.Vectors.dense([1.0, 0.0, 0.0, 1.0, 0.0, 5.0])},
            {"label": 4, "features": linalg.Vectors.dense([1.0, 0.0, 0.0, 0.0, 1.0, 5.0])},
        ]
        dfData = scSQL.createDataFrame(listData)

        # To split the data into training and evaluation sets, use:
        #   dfLearn, dfCheck = dfData.randomSplit([0.8, 0.2])

        mlNBayes = classification.NaiveBayes(smoothing=1.0, modelType="multinomial")
        model = mlNBayes.fit(dfData)

        # The model is applied to the very rows it was trained on.
        predictions = model.transform(dfData)
        for r in predictions.collect():
            # Single-argument print() emits the same text on Python 2 and 3.
            print(r)

        # NOTE(review): RMSE over class indices is kept to preserve this
        # sample's output, but it is not a natural classification metric;
        # evaluation.MulticlassClassificationEvaluator (e.g. accuracy) is
        # probably the better fit -- confirm before reusing this pattern.
        evaluator = evaluation.RegressionEvaluator(
            metricName="rmse",
            labelCol="label",
            predictionCol="prediction",
        )
        rmse = evaluator.evaluate(predictions)
        # Two spaces after "=" reproduce the output of the original
        # Python 2 statement `print "rmse = ", rmse`.
        print("rmse =  %s" % (rmse,))
    finally:
        # Release the Spark resources even when a step above fails.
        sc.stop()


if __name__ == "__main__":
    main()