2

我想读取文件数据并检查 Cassandra 中是否存在文件行数据,如果存在则需要合并,否则将新插入到 C*。文件数据只包含名称,json格式的地址,在Cassandra学生表中以UUID为主键,名称上有二级索引

将数据合并到 cassandra 后,我想将新的 UUID 或现有的 UUID 发送到 KAfka。

当我在 mesos 集群上的本地或单台机器上运行时(保持线 sparkConf.setMaster("local[4]");) 这个程序可以工作但是当我提交给带有 4 个从属的 mesos master 时(注释行 //sparkConf.setMaster(" local[4]"); 在集群上)在 javastreaming 上下文中从 Cassandra 选择数据时存在空指针

我将流上下文设为静态,因为它在文件 dstream 的映射转换中被访问时引发了序列化异常。

这种方法有什么问题吗?是因为我正在尝试使用 DStream 地图转换构建 Cassandra RDD,这会导致问题吗?

import kafka.producer.KeyedMessage;

import com.datastax.spark.connector.japi.CassandraStreamingJavaUtil;

import com.google.gson.Gson;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

import java.util.Properties;
import java.util.UUID;


import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import org.cloudera.spark.streaming.kafka.JavaDStreamKafkaWriter;
import org.cloudera.spark.streaming.kafka.JavaDStreamKafkaWriterFactory;

import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapRowTo;
import static com.datastax.spark.connector.japi.CassandraJavaUtil.mapToRow;


public class DStreamExample {


    public DStreamExample() {
    }

  private static JavaStreamingContext ssc;

    public static void main(final String[] args) {
        final SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("SparkJob");

        sparkConf.setMaster("local[4]"); // for local
        sparkConf.set("spark.cassandra.connection.host", cassandra_hosts);


        ssc = new JavaStreamingContext(sparkConf,new Duration(2000));


        final JavaDStream<Student> studentFileDStream = ssc.textFileStream(
                "/usr/local/fileDir/").map(line -> {
                final Gson gson = new Gson();
                final JsonParser parser = new JsonParser();

                final JsonObject jsonObject = parser.parse(line)
                    .getAsJsonObject();

                // generating new UUID     
                studentFile.setId(UUID.randomUUID());

                final Student studentFile = gson.fromJson(jsonObject,
                        Student.class);

            try{
                //NullPointer at this line while running on cluster
                final JavaRDD<Student> cassandraStudentRDD =
                    CassandraStreamingJavaUtil.javaFunctions(ssc)
                    .cassandraTable("keyspace", "student",
                        mapRowTo(Student.class)).where("name=?",
                        studentFile.getName());


                //If student name is found in cassandra table then assign UUID to fileStudent object
                //This way i wont create multiple records for same name student
                final Student studentCassandra = cassandraStudentRDD.first();
                studentFile.setId(studentCassandra.getId());

                }catch(Exception e){

                }
                return studentFile;

            });

            //Save student to Cassandra
        CassandraStreamingJavaUtil.javaFunctions(studentFileDStream)
            .writerBuilder("keyspace", "student", mapToRow(Student.class))
            .saveToCassandra();


        final JavaDStreamKafkaWriter<Student> writer =
            JavaDStreamKafkaWriterFactory.fromJavaDStream(studentFileDStream);


        final Properties properties = new Properties();
        properties.put("metadata.broker.list", "server:9092");
        properties.put("serializer.class", "kafka.serializer.StringEncoder");

        //Just send studnet UUID_PUT to kafka    
        writer.writeToKafka(properties,
            student ->
                new KeyedMessage<>("TOPICNAME", student.getId() + "_PUT"));

        ssc.start();
        ssc.awaitTermination();


    }

}

class Student {
    private String address;
    private UUID id;
    private String name;

    public Student() {
    }

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public UUID getId() {
        return id;
    }

    public void setId(UUID id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }
}

异常堆栈跟踪::

org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, servername): java.lang.NullPointerException
        at com.datastax.spark.connector.japi.CassandraStreamingJavaUtil.javaFunctions(CassandraStreamingJavaUtil.java:39)
        at com.ebates.ps.batch.sparkpoc.DStreamPOCExample.lambda$main$d2c4cc2c$1(DStreamPOCExample.java:109)
        at org.apache.spark.api.java.JavaPairRDD$$anonfun$toScalaFunction$1.apply(JavaPairRDD.scala:1027)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
        at scala.collection.Iterator$class.foreach(Iterator.scala:727)
        at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
        at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
        at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
        at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
        at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
        at scala.collection.AbstractIterator.to(Iterator.scala:1157)
        at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
        at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
        at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
        at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
        at org.cloudera.spark.streaming.kafka.RDDKafkaWriter$$anonfun$writeToKafka$1.apply(RDDKafkaWriter.scala:47)
        at org.cloudera.spark.streaming.kafka.RDDKafkaWriter$$anonfun$writeToKafka$1.apply(RDDKafkaWriter.scala:45)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:898)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:898)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1848)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
        at org.apache.spark.scheduler.Task.run(Task.scala:88)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
        at scala.Option.foreach(Option.scala:236)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1822)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1835)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1848)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1919)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:898)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:896)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
        at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:896)
4

0 回答 0