I try to execute the code below, but it fails with the following errors:

java.lang.OutOfMemoryError: Java heap space
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 4

The code runs fine on small files (a few KB), but on "larger" files (5 MB) I get these errors. I tried increasing the VM memory and spark.driver.memory, but I hit the same errors again.
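In case the submission matters, this is roughly how I launch the job (a sketch: the class name, jar name, and the 4g value are placeholders, not my exact settings):

    spark-submit \
      --class Main \
      --driver-memory 4g \
      myApp.jar input.txt output

The full code: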
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

// Declared at class level in my code; shown here so the snippet is self-contained.
private static final Pattern SPACE = Pattern.compile(" ");

SparkConf sparkConf = new SparkConf().setAppName("aName");
JavaSparkContext sc = new JavaSparkContext(sparkConf);

// Read the edge list; each non-comment line holds two node ids.
JavaRDD<String> lines = sc.textFile(args[0]);

// Emit each edge with its endpoints ordered (smaller id first); skip comment lines.
JavaPairRDD<String, String> edges = lines.flatMapToPair(t -> {
    List<Tuple2<String, String>> result = new ArrayList<>();
    if (!t.contains("#")) {
        String[] nodes = SPACE.split(t);
        if (Long.parseLong(nodes[0]) < Long.parseLong(nodes[1])) {
            result.add(new Tuple2<>(nodes[0], nodes[1]));
        } else {
            result.add(new Tuple2<>(nodes[1], nodes[0]));
        }
    }
    return result.iterator();
});

// The same edges with key and value swapped.
JavaPairRDD<String, String> edgesReverse = edges.mapToPair(t -> new Tuple2<>(t._2(), t._1()));

// Join neighbour lists to enumerate candidate two-hop node pairs.
JavaPairRDD<String, Tuple2<String, String>> rdd1 = edges.join(edgesReverse);
JavaPairRDD<String, Tuple2<String, String>> rdd2 = edges.join(edges);
JavaPairRDD<String, Tuple2<String, String>> allRDD = rdd1.union(rdd2).distinct();

// Order each candidate pair, drop pairs that are already edges,
// count the common neighbours per pair, and sort by count (descending).
JavaPairRDD<Tuple2<String, String>, Double> commonNeighbors = allRDD.mapToPair(t -> {
    if (Long.parseLong(t._2()._1()) < Long.parseLong(t._2()._2())) {
        return new Tuple2<>(t._2()._1(), t._2()._2());
    } else {
        return new Tuple2<>(t._2()._2(), t._2()._1());
    }
}).subtract(edges)
  .mapToPair(t -> new Tuple2<>(t, 1.0))
  .reduceByKey((a, b) -> a + b)
  .mapToPair(t -> new Tuple2<>(t._2(), t._1()))
  .sortByKey(false)
  .mapToPair(t -> new Tuple2<>(t._2(), t._1()));

commonNeighbors.saveAsTextFile(args[1]);