I am using Spark 1.6.0 and Scala 2.10.5.
$ spark-shell --packages com.databricks:spark-csv_2.10:1.5.0
import org.apache.spark.sql.SQLContext
import sqlContext.implicits._
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
val bankSchema = StructType(Array(
  StructField("age", IntegerType, true),
  StructField("job", StringType, true),
  StructField("marital", StringType, true),
  StructField("education", StringType, true),
  StructField("default", StringType, true),
  StructField("balance", IntegerType, true),
  StructField("housing", StringType, true),
  StructField("loan", StringType, true),
  StructField("contact", StringType, true),
  StructField("day", IntegerType, true),
  StructField("month", StringType, true),
  StructField("duration", IntegerType, true),
  StructField("campaign", IntegerType, true),
  StructField("pdays", IntegerType, true),
  StructField("previous", IntegerType, true),
  StructField("poutcome", StringType, true),
  StructField("y", StringType, true)))

val market_details = sqlContext.
  read.
  format("com.databricks.spark.csv").
  option("header", "true").
  schema(bankSchema).
  load("/user/sachnil.2007_gmail/Project1_dataset_bank-full.csv")
market_details.registerTempTable("phone_table")
val temp = sqlContext.sql("SELECT * FROM phone_table").show()
The error I get is:
17/05/14 06:11:42 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1)
java.lang.NumberFormatException: For input string: "58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no""
    at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
    at java.lang.Integer.parseInt(Integer.java:580)
    at java.lang.Integer.parseInt(Integer.java:615)
    at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229)
    at scala.collection.immutable.StringOps.toInt(StringOps.scala:31)
    at com.databricks.spark.csv.util.TypeCast$.castTo(TypeCast.scala:61)
    at com.databricks.spark.csv.CsvRelation$$anonfun$buildScan$2.apply(CsvRelation.scala:121)
    at com.databricks.spark.csv.CsvRelation$$anonfun$buildScan$2.apply(CsvRelation.scala:108)
    at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:308)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at scala.collection.AbstractIterator.to(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$5.apply(SparkPlan.scala:212)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$5.apply(SparkPlan.scala:212)
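The exception's "input string" is the entire row rather than a single field, so it looks as if the line is never split into columns at all. A quick way to look at the raw lines independently of spark-csv (a minimal sketch, run in the same shell and using the same path as above) is a plain RDD read:

// Print the first few raw lines of the file without any CSV parsing,
// just to see how the fields are actually separated.
sc.textFile("/user/sachnil.2007_gmail/Project1_dataset_bank-full.csv").take(3).foreach(println)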
The CSV content looks like this:
"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"
44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"
47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"
How can I fix this?