在 StreamSets Data Collector 中,Groovy 脚本的执行速度比 JavaScript 快得多,因此下面给出用 Groovy 实现的相同解决方案。
使用文本(Text)数据格式时,目录源(Directory origin)会为每行输入创建一条只包含一个 /text 字段的记录。
此脚本将构建您需要的记录结构:
// Rebuilds one structured record from each '{' ... '}' group of input lines.
//
// Every incoming record is read through its 'text' field, which holds one
// line of input. Lines between an opening '{' and a closing '}' are parsed as
// key=value pairs and collected into a single output record.
//
// The record being assembled is kept in state['outRecord']. Because the loop
// consults that entry for every line, a group that is cut in half by a batch
// boundary is resumed at the start of the next batch instead of being lost.
// NOTE(review): this relies on the evaluator keeping `state` alive between
// batches and accepting a record obtained in an earlier batch - confirm
// against the Data Collector version in use.
for (int i = 0; i < records.size(); i++) {
  try {
    def line = records[i].value['text'].trim()

    if (state['outRecord'] == null) {
      // Not inside a group: only an opening brace is meaningful, anything
      // else is skipped
      if (line == "{") {
        // Reuse the opening input record as the output record and
        // clean out its value
        state['outRecord'] = records[i]
        state['outRecord'].value = [:]
      }
    } else if (line == "}") {
      // End of the group: write the assembled record to processor output
      output.write(state['outRecord'])
      log.debug('Wrote a record with {} fields',
          state['outRecord'].value.size())
      state['outRecord'] = null
    } else {
      // Split on the first '=' only, so values that themselves contain '='
      // are kept intact instead of being discarded
      def kv = line.split('=', 2)
      if (kv.length == 2 && kv[1].length() > 0) {
        state['outRecord'].value[kv[0]] = kv[1]
      } else if (kv[0].length() > 0) {
        // Key with nothing after the '=' (or no '=' at all)
        state['outRecord'].value[kv[0]] = NULL_STRING
      }
    }
  } catch (e) {
    // Write the offending line to the error pipeline
    log.error(e.toString(), e)
    error.write(records[i], e.toString())
    // Drop the partially built record; the remaining lines of the group are
    // skipped until the next '{'
    state['outRecord'] = null
  }
}
对以下输入数据运行该脚本:
{
1=959450992837
2=95973085229
3=1525785953
4=29
7=2
8=
9=
16=abd
20=def
21=ghi;jkl
22=a@b.com
23=1525785953
40=95973085229
41=959450992837
42=0
43=0
44=0
45=0
74=1
96=1
98=4
99=3
}
给出输出:
{
"1": "959450992837",
"2": "95973085229",
"3": "1525785953",
"4": "29",
"7": "2",
"8": null,
"9": null,
"16": "abd",
"20": "def",
"21": "ghi;jkl",
"22": "a@b.com",
"23": "1525785953",
"40": "95973085229",
"41": "959450992837",
"42": "0",
"43": "0",
"44": "0",
"45": "0",
"74": "1",
"96": "1",
"98": "4",
"99": "3"
}