我正在处理一个政治竞选捐款数据集,它最终被转换成一个约 500 MB 的 JSON 文件(原始数据是一个 124 MB 的 CSV)。文件太大,无法通过 Firebase 网页控制台导入(我在 Google Chrome 中试过,标签页直接崩溃了)。于是我尝试在把 CSV 转换为对象的同时逐个手动上传:使用 CSVtoJSON 转换器把每一行转换成一个 JSON 对象,每生成一个对象就立即上传到 Firebase。
这是我使用的代码。
/*
 * Streams a CSV file into the Firebase Realtime Database, one row per record.
 *
 * Rows are written to /reports<section>/<Report_ID>, with a new section
 * started every ROWS_PER_SECTION rows. The run stops after MAX_SECTIONS
 * sections and prints the index of the last row that was handed to Firebase.
 *
 * Two safeguards keep the run bounded and lossless:
 *   - Backpressure: the CSV parser produces rows far faster than Firebase
 *     acknowledges writes. Without a limit, every pending write (and its
 *     payload) is held in memory until the process runs out of heap. The
 *     source stream is detached from the parser while too many writes are
 *     outstanding and re-attached once the backlog has shrunk.
 *   - Drain before exit: process.exit() discards writes that have not been
 *     acknowledged yet, so the script only exits once every queued write has
 *     completed (successfully or not).
 */
var firebase = require('firebase');
var Converter = require("csvtojson").Converter;

firebase.initializeApp({
  serviceAccount: "./credentials.json",
  databaseURL: "url went here"
});

var converter = new Converter({
  constructResult: false,
  workerNum: 4
});

var db = firebase.database();
var ref = db.ref("/");

// Chunking limits (same values the script has always used).
var ROWS_PER_SECTION = 1000;
var MAX_SECTIONS = 100;

// Backpressure thresholds: stop feeding the parser above the high mark,
// start feeding it again once the backlog falls to the low mark.
var MAX_IN_FLIGHT = 500;
var RESUME_IN_FLIGHT = 100;

var lastindex = 0;
var count = 0;
var section = 0;
var sectionRef;

var inFlight = 0;        // writes sent to Firebase but not yet acknowledged
var failedWrites = 0;    // writes Firebase reported as failed
var feedingPaused = false;
var sourceEnded = false;
var stopping = false;
var exitMessage = null;

var readStream = require("fs").createReadStream("./vUPLOAD_MASTER.csv");

// Once the file has been fully read there is nothing left to pause/resume,
// and re-piping an ended stream would try to end the parser a second time.
readStream.on("end", function () {
  sourceEnded = true;
});

/** Exits the process once a stop was requested and no write is outstanding. */
function exitWhenDrained() {
  if (!stopping || inFlight > 0) {
    return;
  }
  console.log(exitMessage);
  if (failedWrites > 0) {
    console.error(failedWrites + " report(s) failed to upload");
  }
  process.exit(failedWrites > 0 ? 1 : 0);
}

/**
 * Requests a graceful shutdown: no further rows are accepted, and the
 * process exits as soon as all outstanding writes have been acknowledged.
 *
 * @param {string} message - Line printed right before the process exits.
 */
function requestStop(message) {
  if (stopping) {
    return;
  }
  stopping = true;
  exitMessage = message;
  if (!sourceEnded) {
    readStream.unpipe(converter);
  }
  exitWhenDrained();
}

converter.on("record_parsed", function (resultRow, rawRow, rowIndex) {
  // Rows already buffered inside the parser may still arrive after a stop
  // was requested; they belong to the next run and are ignored here.
  if (stopping) {
    return;
  }

  if (rowIndex < 0) {
    requestStop("we out of indices");
    return;
  }

  // Capture the position now: count/section advance before the write is
  // acknowledged, and the log line must describe this particular row.
  var rowCount = count;
  var rowSection = section;

  sectionRef = ref.child("reports" + section);
  var reportRef = sectionRef.child(resultRow.Report_ID);

  inFlight += 1;
  reportRef.set(resultRow, function (error) {
    inFlight -= 1;

    if (error) {
      failedWrites += 1;
      console.error("Report upload failed at index " + rowIndex + ":", error);
    } else {
      console.log("Report uploaded, count at " + rowCount + ", section at " + rowSection);
    }

    if (feedingPaused && !stopping && !sourceEnded && inFlight <= RESUME_IN_FLIGHT) {
      feedingPaused = false;
      readStream.pipe(converter);
    }

    exitWhenDrained();
  });

  count += 1;
  lastindex = rowIndex;
  if (count >= ROWS_PER_SECTION) {
    count = 0;
    section += 1;
  }

  if (section >= MAX_SECTIONS) {
    requestStop("last completed index: " + lastindex);
    return;
  }

  // unpipe() is used instead of pause(): a paused source that still has a
  // pipe destination may be resumed by that destination when it drains.
  if (!feedingPaused && !sourceEnded && inFlight >= MAX_IN_FLIGHT) {
    feedingPaused = true;
    readStream.unpipe(converter);
  }
});

readStream.pipe(converter);
但是,这种方式会遇到内存问题,无法上传完整个数据集。分块上传也行不通,因为 Firebase 没有显示出全部已上传的数据,我无法确定上次停在了哪里。(在 Chrome 中打开 Firebase 数据库时,我能看到数据不断写入,但标签页最终会崩溃,重新加载后发现很多数据丢失了。)
然后我尝试使用Firebase Streaming Import,但是会引发此错误:
started at 1469471482.77
Traceback (most recent call last):
File "import.py", line 90, in <module>
main(argParser.parse_args())
File "import.py", line 20, in main
for prefix, event, value in parser:
File "R:\Python27\lib\site-packages\ijson\common.py", line 65, in parse
for event, value in basic_events:
File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 185, in basic_parse
for value in parse_value(lexer):
File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 127, in parse_value
raise UnexpectedSymbol(symbol, pos)
ijson.backends.python.UnexpectedSymbol: Unexpected symbol u'\ufeff' at 0
我搜索了最后一行(来自 ijson 的错误),找到了这个 Stack Overflow 帖子,但我不确定该如何利用其中的信息让 Firebase Streaming Import 正常工作。
我使用 Vim 从尝试上传的 JSON 文件中删除了字节顺序标记,现在在运行导入器大约一分钟后出现此错误:
Traceback (most recent call last):
File "import.py", line 90, in <module>
main(argParser.parse_args())
File "import.py", line 20, in main
for prefix, event, value in parser:
File "R:\Python27\lib\site-packages\ijson\common.py", line 65, in parse
for event, value in basic_events:
File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 185, in basic_parse
for value in parse_value(lexer):
File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 116, in parse_value
for event in parse_array(lexer):
File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 138, in parse_array
for event in parse_value(lexer, symbol, pos):
File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 119, in parse_value
for event in parse_object(lexer):
File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 170, in parse_object
pos, symbol = next(lexer)
File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 51, in Lexer
buf += data
MemoryError
Firebase Streaming Importer 应该能够处理超过 250mb 的文件,而且我相当确定我有足够多的 RAM 来处理这个文件。关于为什么会出现此错误的任何想法?
如果查看我尝试用 Firebase Streaming Import 上传的实际 JSON 文件会有帮助,可以在这里找到。