我有一个 Snowpipe(Snowflake 的自动数据加载管道)。我无法控制传入的文件。每个文件都只包含一条相对较小的 JSON 记录。
当我因为出错而重新启动管道时,暂存区(stage)中大约积压了 38000 个文件。
目前我每分钟只能处理 6-8 个文件。编辑:更正一下,我在 55 分钟内处理了大约 750 个文件(约每分钟 14 个)。还是很慢。
在不改变“每个文件只含一条记录”这一约束的前提下,如何加快处理速度?
{
"id": "55555555-5555-5555-5555-555555555555",
"val": {
"bank": "bank name",
"browser_version": "1.7.4044.138",
"cpu": "Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz",
"cpu_architecture": "x86_64",
"created_at": "2020-07-31T10:21:12.992345225Z",
"current_address": "address",
"current_city": null,
"current_country": "my country",
"date_of_birth": "1990-03-21",
"document_number": "55-55-55-55555",
"document_type": "this type of document",
"email": "some.email@email.com",
"full_name": "some name",
"gender": "Male",
"internal_storage": "237.87 GB",
"joined_date": "2020-07-31",
"last_logged_in": null,
"last_worked_on": "2020-12-02",
"latitude": "Not Found",
"longitude": "Not Found",
"memory": "7.74 GB",
"number_of_processors": "8",
"app_client_uid": "{55555555-5LLL-5555-55L5-555B5555DB5A}",
"operating_system": "win 10.0",
"other_display": null,
"id_number": "555555555",
"permanent_address": null,
"permanent_city": null,
"permanent_country": null,
"personal_email": "anotheremail@gmail.com",
"primary_display": "1920x1080",
"role_id": "55555555-5555-5555-5555-555555555555",
"source_system": [
{
"id": "astringidthiny",
"system": "system_name"
}
],
"status": "Active",
"type": "worker",
"updated_at": "2020-12-02T08:32:56Z",
"webcam": "true",
"workstream_ids": [
"55555555-5555-5555-5555-555555555555"
]
}
}