Apache Spark 2.1.0(5)

やる気の感じられないサンプルなのは、動作メモのつもりで作成しているため…

recommendation

# -*- coding: utf-8 -*-
import pyspark
from pyspark.ml import recommendation
from pyspark.ml import evaluation

def main():
    """Fit an ALS recommender on a tiny in-memory rating set and print its RMSE.

    Creates a SparkContext, builds a DataFrame from nine hard-coded
    (user, item, rating) rows, fits ``recommendation.ALS`` on it, prints
    every prediction row, and finally prints the root-mean-square error.
    The predictions are made on the same rows that were used for fitting.

    The SparkContext is always stopped before returning, even when one of
    the Spark steps raises.
    """
    conf = pyspark.SparkConf()
    conf.setAppName("nanohain sample3")
    sc = pyspark.SparkContext(conf=conf)
    try:
        sc.setLogLevel("ERROR")
        scSQL = pyspark.sql.SQLContext(sc)

        listData = [
            {"user": 1, "item": 1, "rating":  90.5},
            {"user": 1, "item": 2, "rating":  90.5},
            {"user": 1, "item": 3, "rating":  90.5},
            {"user": 2, "item": 1, "rating":  50.0},
            {"user": 2, "item": 2, "rating":  50.0},
            {"user": 2, "item": 3, "rating":  50.0},
            {"user": 3, "item": 1, "rating":  80.0},
            {"user": 3, "item": 2, "rating": 100.0},
            {"user": 3, "item": 3, "rating": 100.0}
        ]

        dfData = scSQL.createDataFrame(listData)
        # To separate training data from evaluation data, split like this:
        # dfLearn, dfCheck = dfData.randomSplit([0.8, 0.2])

        mlAls = recommendation.ALS(
            maxIter=5, regParam=0.01, implicitPrefs=False,
            userCol="user", itemCol="item", ratingCol="rating"
        )
        model = mlAls.fit(dfData)

        predictions = model.transform(dfData)
        for r in predictions.collect():
            # Single-argument call form behaves the same on Python 2 and 3.
            print(r)

        evaluator = evaluation.RegressionEvaluator(
            metricName="rmse", labelCol="rating", predictionCol="prediction"
        )
        rmse = evaluator.evaluate(predictions)

        # Two spaces after "=" reproduce the output of the former
        # Python 2 statement `print "rmse = ", rmse`.
        print("rmse =  %s" % rmse)
    finally:
        # Release the context so a failure above does not leave it running.
        sc.stop()


# Run the sample only when this file is executed as a script.
if __name__ == "__main__":
    main()

naive bayes

# -*- coding: utf-8 -*-
import pyspark
from pyspark.ml import classification
from pyspark.ml import evaluation
from pyspark.ml import linalg

def main():
    """Fit a multinomial Naive Bayes model on five in-memory rows and print RMSE.

    Creates a SparkContext, builds a DataFrame from five hard-coded
    (label, features) rows, fits ``classification.NaiveBayes`` on it, prints
    every prediction row, and finally prints the root-mean-square error
    between the ``label`` and ``prediction`` columns. The predictions are
    made on the same rows that were used for fitting.

    The SparkContext is always stopped before returning, even when one of
    the Spark steps raises.
    """
    conf = pyspark.SparkConf()
    conf.setAppName("nanohain sample4")
    sc = pyspark.SparkContext(conf=conf)
    try:
        sc.setLogLevel("ERROR")
        scSQL = pyspark.sql.SQLContext(sc)

        # Labels are assigned as consecutive integers starting from 0.
        listData = [
            {"label":  0, "features": linalg.Vectors.dense([1.0, 0.0, 0.0, 0.0, 0.0, 5.0])},
            {"label":  1, "features": linalg.Vectors.dense([1.0, 1.0, 0.0, 0.0, 0.0, 5.0])},
            {"label":  2, "features": linalg.Vectors.dense([1.0, 0.0, 1.0, 0.0, 0.0, 5.0])},
            {"label":  3, "features": linalg.Vectors.dense([1.0, 0.0, 0.0, 1.0, 0.0, 5.0])},
            {"label":  4, "features": linalg.Vectors.dense([1.0, 0.0, 0.0, 0.0, 1.0, 5.0])},
        ]

        dfData = scSQL.createDataFrame(listData)
        # To separate training data from evaluation data, split like this:
        # dfLearn, dfCheck = dfData.randomSplit([0.8, 0.2])

        mlNBeyes = classification.NaiveBayes(smoothing=1.0, modelType="multinomial")
        model = mlNBeyes.fit(dfData)

        predictions = model.transform(dfData)
        for r in predictions.collect():
            # Single-argument call form behaves the same on Python 2 and 3.
            print(r)

        # NOTE(review): RMSE treats the class indices as numeric distances,
        # which is not a meaningful score for a classifier. Consider
        # evaluation.MulticlassClassificationEvaluator instead. Kept as-is
        # so the printed output of this sample does not change.
        evaluator = evaluation.RegressionEvaluator(
            metricName="rmse", labelCol="label", predictionCol="prediction"
        )
        rmse = evaluator.evaluate(predictions)

        # Two spaces after "=" reproduce the output of the former
        # Python 2 statement `print "rmse = ", rmse`.
        print("rmse =  %s" % rmse)
    finally:
        # Release the context so a failure above does not leave it running.
        sc.stop()


# Run the sample only when this file is executed as a script.
if __name__ == "__main__":
    main()