I have a use case where I want to upload large gzip-compressed text data files (~60 GB) to HDFS.
My code below takes about 2 hours to upload these files in 500 MB chunks. The pseudo-code follows. Could somebody help me reduce this time?
int fileFetchBuffer = 500000000;   // 500 MB per output chunk
System.out.println("file fetch buffer is: " + fileFetchBuffer);
int offset = 0;
int bytesRead = -1;
int outFileNum = 0;

try {
    fileStream = new FileInputStream(file);

    if (fileName.endsWith(".gz")) {
        stream = new GZIPInputStream(fileStream);
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream));

        String[] fileN = fileName.split("\\.");
        System.out.println("fil 0 : " + fileN[0]);
        System.out.println("fil 1 : " + fileN[1]);
        //logger.info("First line is: " + streamBuff.readLine());

        byte[] buffer = new byte[fileFetchBuffer];
        FileSystem fs = FileSystem.get(conf);
        int charsLeft = fileFetchBuffer;

        while (true) {
            charsLeft = fileFetchBuffer;
            logger.info("charsLeft outside while: " + charsLeft);

            FSDataOutputStream dos = null;

            // Read from the decompressed stream until one 500 MB chunk is filled
            // or the input is exhausted.
            while (charsLeft != 0) {
                bytesRead = stream.read(buffer, 0, charsLeft);
                if (bytesRead < 0) {
                    break;   // end of input reached inside this chunk
                }
                offset = offset + bytesRead;
                charsLeft = charsLeft - bytesRead;
                logger.info("offset in record: " + offset);
                logger.info("charsLeft: " + charsLeft);
                logger.info("bytesRead in record: " + bytesRead);
                //prettyPrintHex(buffer);

                String outFileStr = Utils.getOutputFileName(
                        stagingDir,
                        fileN[0],
                        outFileNum);

                // Lazily create the output file for this chunk on the first successful read.
                if (dos == null) {
                    Path outFile = new Path(outFileStr);
                    if (fs.exists(outFile)) {
                        fs.delete(outFile, false);
                    }
                    dos = fs.create(outFile);
                }
                dos.write(buffer, 0, bytesRead);
            }

            logger.info("done writing: " + outFileNum);
            if (dos != null) {   // dos stays null if EOF was hit before any bytes of this chunk were read
                dos.flush();
                dos.close();
            }
            if (bytesRead < 0) {
                break;   // whole input consumed
            }
            outFileNum++;
        }
    } else {
        // Assume uncompressed file
        stream = fileStream;
    }
} catch (FileNotFoundException e) {
    logger.error("File not found: " + e);
}
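
For reference, here is a stripped-down, self-contained sketch of the same split-while-decompressing idea, just to make the intent of the loop above clearer. The names here (GzipToHdfsSplitter, splitGzipToHdfs, hdfsDirPrefix, the "-part-N" naming and the small 8 KB read buffer) are only illustrative assumptions, not part of my actual code:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GzipToHdfsSplitter {

    // Decompress a local .gz file and write it to HDFS as numbered part files,
    // each holding roughly chunkSize bytes of uncompressed data.
    public static void splitGzipToHdfs(Configuration conf, String localGzPath,
                                       String hdfsDirPrefix, long chunkSize) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        byte[] buf = new byte[8 * 1024];   // small reusable read buffer
        int partNum = 0;

        try (InputStream in = new GZIPInputStream(new FileInputStream(localGzPath))) {
            int n = in.read(buf);
            while (n >= 0) {
                Path part = new Path(hdfsDirPrefix + "-part-" + partNum);
                if (fs.exists(part)) {
                    fs.delete(part, false);
                }
                long written = 0;
                try (FSDataOutputStream out = fs.create(part)) {
                    // Keep writing into this part until it reaches chunkSize
                    // (give or take one buffer) or the input ends.
                    while (n >= 0 && written < chunkSize) {
                        out.write(buf, 0, n);
                        written += n;
                        n = in.read(buf);
                    }
                }
                partNum++;
            }
        }
    }
}

The only structural difference from my real code is that this sketch reads through a small fixed buffer and rolls over to a new HDFS part file once roughly chunkSize bytes have been written, instead of allocating a single 500 MB array.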