我想读取文件数据并检查 Cassandra 中是否存在文件行数据,如果存在则需要合并,否则将新插入到 C*。文件数据只包含名称,json格式的地址,在Cassandra学生表中以UUID为主键,名称上有二级索引
将数据合并到 cassandra 后,我想将新的 UUID 或现有的 UUID 发送到 KAfka。
当我在 mesos 集群上的本地或单台机器上运行时(保持线 sparkConf.setMaster("local[4]");) 这个程序可以工作但是当我提交给带有 4 个从属的 mesos master 时(注释行 //sparkConf.setMaster(" local[4]"); 在集群上)在 javastreaming 上下文中从 Cassandra 选择数据时存在空指针
我将流上下文设为静态,因为它在文件 dstream 的映射转换中被访问时引发了序列化异常。
这种方法有什么问题吗?是因为我正在尝试使用 DStream 地图转换构建 Cassandra RDD,这会导致问题吗?
import kafka.producer.KeyedMessage;
import com.datastax.spark.connector.japi.CassandraStreamingJavaUtil;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.util.Properties;
import java.util.UUID;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.cloudera.spark.streaming.kafka.JavaDStreamKafkaWriter;
import org.cloudera.spark.streaming.kafka.JavaDStreamKafkaWriterFactory;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapRowTo;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapToRow;
public class DStreamExample {
public DStreamExample() {
}
private static JavaStreamingContext ssc;
public static void main(final String[] args) {
final SparkConf sparkConf = new SparkConf();
sparkConf.setAppName("SparkJob");
sparkConf.setMaster("local[4]"); // for local
sparkConf.set("spark.cassandra.connection.host", cassandra_hosts);
ssc = new JavaStreamingContext(sparkConf,new Duration(2000));
final JavaDStream<Student> studentFileDStream = ssc.textFileStream(
"/usr/local/fileDir/").map(line -> {
final Gson gson = new Gson();
final JsonParser parser = new JsonParser();
final JsonObject jsonObject = parser.parse(line)
.getAsJsonObject();
// generating new UUID
studentFile.setId(UUID.randomUUID());
final Student studentFile = gson.fromJson(jsonObject,
Student.class);
try{
//NullPointer at this line while running on cluster
final JavaRDD<Student> cassandraStudentRDD =
CassandraStreamingJavaUtil.javaFunctions(ssc)
.cassandraTable("keyspace", "student",
mapRowTo(Student.class)).where("name=?",
studentFile.getName());
//If student name is found in cassandra table then assign UUID to fileStudent object
//This way i wont create multiple records for same name student
final Student studentCassandra = cassandraStudentRDD.first();
studentFile.setId(studentCassandra.getId());
}catch(Exception e){
}
return studentFile;
});
//Save student to Cassandra
CassandraStreamingJavaUtil.javaFunctions(studentFileDStream)
.writerBuilder("keyspace", "student", mapToRow(Student.class))
.saveToCassandra();
final JavaDStreamKafkaWriter<Student> writer =
JavaDStreamKafkaWriterFactory.fromJavaDStream(studentFileDStream);
final Properties properties = new Properties();
properties.put("metadata.broker.list", "server:9092");
properties.put("serializer.class", "kafka.serializer.StringEncoder");
//Just send studnet UUID_PUT to kafka
writer.writeToKafka(properties,
student ->
new KeyedMessage<>("TOPICNAME", student.getId() + "_PUT"));
ssc.start();
ssc.awaitTermination();
}
}
class Student {
private String address;
private UUID id;
private String name;
public Student() {
}
public String getAddress() {
return address;
}
public void setAddress(String address) {
this.address = address;
}
public UUID getId() {
return id;
}
public void setId(UUID id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
}
异常堆栈跟踪::
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, servername): java.lang.NullPointerException
at com.datastax.spark.connector.japi.CassandraStreamingJavaUtil.javaFunctions(CassandraStreamingJavaUtil.java:39)
at com.ebates.ps.batch.sparkpoc.DStreamPOCExample.lambda$main$d2c4cc2c$1(DStreamPOCExample.java:109)
at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.cloudera.spark.streaming.kafka.RDDKafkaWriter$$anonfun$writeToKafka$1.apply(RDDKafkaWriter.scala:47)
at org.cloudera.spark.streaming.kafka.RDDKafkaWriter$$anonfun$writeToKafka$1.apply(RDDKafkaWriter.scala:45)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:898)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:898)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1822)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1835)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1848)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1919)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:898)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:896)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:896)