ScalaでSpark
適宜追加予定
// spark-shell session: load iris.csv into a DataFrame, then slice it by column and by row.
// Lines starting with `scala>` are typed at the prompt; everything else is REPL output.

// Load the CSV file (first row is the header, malformed rows are dropped, column types are inferred)
scala> val df = spark.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").option("inferSchema","True").load("iris.csv")

// Print each column's type
scala> df.printSchema()

// Display rows (20 by default)
scala> df.show()

// Register a temporary view name for SQL.
// Global temp views are resolved through the `global_temp` database.
scala> df.createGlobalTempView("iris")

// Column arithmetic via SQL (the view name is required; .show() prints the result).
// The name after `global_temp.` must match the one registered above.
scala> spark.sql("SELECT SepalLength+SepalWidth FROM global_temp.iris").show()

// Get the column names and slice them (slice(from, until): `until` is exclusive)
scala> val df_sep = df.columns.slice(1,3)
df_sep: Array[String] = Array(SepalWidth, PetalLength)

// Column-wise slice (slice the names by column index, then select the matching data)
scala> val df_columns = df.columns.slice(1,4)
df_columns: Array[String] = Array(SepalWidth, PetalLength, PetalWidth)

scala> df.select(df_columns.head,df_columns.tail:_*).show(3)
+----------+-----------+----------+
|SepalWidth|PetalLength|PetalWidth|
+----------+-----------+----------+
|       3.5|        1.4|       0.2|
|       3.0|        1.4|       0.2|
|       3.2|        1.3|       0.2|
+----------+-----------+----------+

// Row-wise slice: attach an id column, then filter on it.
// monotonically_increasing_id() replaces monotonicallyIncreasingId(), which was deprecated in
// Spark 2.0 and removed in 3.0.
// NOTE: the generated ids are unique and increasing but NOT guaranteed to be consecutive; they
// only look like 0,1,2,... here because this small file is read as a single partition.
scala> import org.apache.spark.sql.functions._

scala> val idxDf = df.withColumn("idx", monotonically_increasing_id())

scala> idxDf.show(3)
+-----------+----------+-----------+----------+-----------+---+
|SepalLength|SepalWidth|PetalLength|PetalWidth|       Name|idx|
+-----------+----------+-----------+----------+-----------+---+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|  0|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|  1|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|  2|
+-----------+----------+-----------+----------+-----------+---+

// Both bounds are exclusive, so this keeps the rows with idx 11 through 14
scala> val ex_Df1 = idxDf.filter("idx > 10")

scala> val ex_Df = ex_Df1.filter("idx < 15")

scala> ex_Df.show()
+-----------+----------+-----------+----------+-----------+---+
|SepalLength|SepalWidth|PetalLength|PetalWidth|       Name|idx|
+-----------+----------+-----------+----------+-----------+---+
|        4.8|       3.4|        1.6|       0.2|Iris-setosa| 11|
|        4.8|       3.0|        1.4|       0.1|Iris-setosa| 12|
|        4.3|       3.0|        1.1|       0.1|Iris-setosa| 13|
|        5.8|       4.0|        1.2|       0.2|Iris-setosa| 14|
+-----------+----------+-----------+----------+-----------+---+
参考
python - Splitting DataFrames in Apache Spark - Stack Overflow
scala - Get a range of columns of Spark RDD - Stack Overflow