1

我开始怀疑pandas.json_normalize在我正在分析的生产环境中的运行速度到底如何,所以我编写了一个玩具程序,将它与一些手写的for循环方案进行比较。我不明白的是:为什么一个更基础(也更丑陋)的方案会比pandas提供的json_normalize快得多。

所以我为实验创建了一大堆嵌套json对象,并比较了三种方法的性能:

  1. 只需将数据提供给pandas.json_normalize
  2. 使用for循环将数据转换为可以直接传给pandas.DataFrame()的dict,并且只转换部分字段
  3. 与方法 2 相同,但适用于每个字段。

每个json对象的结构都模仿了我目前拥有的生产数据。

这是我的完整程序。

import pandas as pd
from pprint import pprint as pp
import time

# profile pandas json serialization

def _make_large_json_sample(size):
    '''
    Build `size` synthetic records for the benchmark.

    The result is a list of dictionaries. Each dictionary holds 20 flat
    ``field<N>`` entries followed by two nested dictionaries: ``outerone``
    (10 ``innerone<N>`` entries) and ``outertwo`` (40 ``innertwo<N>``
    entries). All values are derived from the record's position in the list.
    '''

    def _value_for(label, position, seed):
        # The slot number alone decides which kind of value is produced.
        if position in (0, 1):
            # datetime-like string; the seed is left-padded with zeros to
            # 7 characters and used as the fractional part
            padded = ('%7d' % seed).replace(' ', '0')
            return '2021-11-02T05:30:00.%s' % padded
        if position in (2, 3):
            # integer
            return int(seed)
        if position in (4, 5, 6, 7):
            # float
            return 10.0*seed
        if position == 8:
            # explicit missing value
            return None
        # every remaining slot is a descriptive string
        return '%s%d for %s' % (label, position, seed)

    def _flat_record(label, seed, width):
        # One non-nested dictionary with keys label0 .. label<width-1>.
        return {
            '%s%d' % (label, position): _value_for(label, position, seed)
            for position in range(width)
        }

    def _build_record(seed):
        # Flat fields first, then the two nested sub-dictionaries.
        record = _flat_record('field', seed, 20)
        record['outerone'] = _flat_record('innerone', seed, 10)
        record['outertwo'] = _flat_record('innertwo', seed, 40)
        return record

    records = []
    for seed in range(size):
        records.append(_build_record(seed))
    return records


if __name__ == '__main__':
    # Benchmark three ways of turning the nested sample into a DataFrame.
    # Intervals are measured with time.perf_counter() rather than time.time():
    # it is the highest-resolution clock available for short durations and is
    # not affected by system clock adjustments during the run.
    start = time.perf_counter()
    sample = _make_large_json_sample(50000)
    end = time.perf_counter()

    print('data generation costs %s seconds' % (end-start)) # around 2.38s
    # pp(sample)

    # Method 1: hand the whole list to pandas.json_normalize.
    start = time.perf_counter()
    df1 = pd.json_normalize(sample)
    end = time.perf_counter()

    print('json normalize costs %s seconds' % (end-start)) # around 8.17s

    def _extract_simple_key(key):
        '''Return the top-level value under `key` for every record in `sample`.'''
        return [element[key] for element in sample]

    def _extract_nest_key(key1, key2):
        '''Return the value at `element[key1][key2]` for every record in `sample`.'''
        return [element[key1][key2] for element in sample]

    def _extract_columns(field_ids, innerone_ids, innertwo_ids):
        '''
        Build a column-name -> list-of-values dict for pandas.DataFrame().

        `field_ids` selects the flat ``field<N>`` columns; `innerone_ids` and
        `innertwo_ids` select the nested ``outerone.innerone<N>`` and
        ``outertwo.innertwo<N>`` columns. Columns are inserted in the order
        given: flat fields first, then outerone, then outertwo.
        '''
        columns = {}
        for idx in field_ids:
            key = 'field%d' % idx
            columns[key] = _extract_simple_key(key)
        for outer, inner, ids in (
            ('outerone', 'innerone', innerone_ids),
            ('outertwo', 'innertwo', innertwo_ids),
        ):
            for idx in ids:
                key = '%s%d' % (inner, idx)
                # Dotted names match the column names used in the original
                # hand-written dictionaries.
                columns['%s.%s' % (outer, key)] = _extract_nest_key(outer, key)
        return columns

    # Method 2: pick only the fields of interest manually:
    # field 1, 2, 3, 4, 5, 10, 11
    # outerone.innerone0-9
    # outertwo.innertwo0-4, 10-14
    def _extract_part():
        '''Return the column dict for the hand-picked subset of fields.'''
        return _extract_columns(
            (1, 2, 3, 4, 5, 10, 11),
            range(10),
            (0, 1, 2, 3, 4, 10, 11, 12, 13, 14),
        )

    start = time.perf_counter()
    df2 = pd.DataFrame(_extract_part())
    end = time.perf_counter()
    print('partial manual normalization costs %s seconds' % (end-start)) # around 0.78s

    # Method 3: same approach as method 2, but for every field.
    def _extract_full():
        '''Return the column dict covering all 20 + 10 + 40 fields.'''
        return _extract_columns(range(20), range(10), range(40))

    start = time.perf_counter()
    df3 = pd.DataFrame(_extract_full())
    end = time.perf_counter()
    print('full manual normalization costs %s seconds' % (end-start)) # around 2s

列表中的示例对象如下所示:

>>> pp(sample[1])
{'field0': '2021-11-02T05:30:00.0000001',
 'field1': '2021-11-02T05:30:00.0000001',
 'field10': 'field10 for 1',
 'field11': 'field11 for 1',
 'field12': 'field12 for 1',
 'field13': 'field13 for 1',
 'field14': 'field14 for 1',
 'field15': 'field15 for 1',
 'field16': 'field16 for 1',
 'field17': 'field17 for 1',
 'field18': 'field18 for 1',
 'field19': 'field19 for 1',
 'field2': 1,
 'field3': 1,
 'field4': 10.0,
 'field5': 10.0,
 'field6': 10.0,
 'field7': 10.0,
 'field8': None,
 'field9': 'field9 for 1',
 'outerone': {'innerone0': '2021-11-02T05:30:00.0000001',
              'innerone1': '2021-11-02T05:30:00.0000001',
              'innerone2': 1,
              'innerone3': 1,
              'innerone4': 10.0,
              'innerone5': 10.0,
              'innerone6': 10.0,
              'innerone7': 10.0,
              'innerone8': None,
              'innerone9': 'innerone9 for 1'},
 'outertwo': {'innertwo0': '2021-11-02T05:30:00.0000001',
              'innertwo1': '2021-11-02T05:30:00.0000001',
              'innertwo10': 'innertwo10 for 1',
              'innertwo11': 'innertwo11 for 1',
              'innertwo12': 'innertwo12 for 1',
              'innertwo13': 'innertwo13 for 1',
              'innertwo14': 'innertwo14 for 1',
              'innertwo15': 'innertwo15 for 1',
              'innertwo16': 'innertwo16 for 1',
              'innertwo17': 'innertwo17 for 1',
              'innertwo18': 'innertwo18 for 1',
              'innertwo19': 'innertwo19 for 1',
              'innertwo2': 1,
              'innertwo20': 'innertwo20 for 1',
              'innertwo21': 'innertwo21 for 1',
              'innertwo22': 'innertwo22 for 1',
              'innertwo23': 'innertwo23 for 1',
              'innertwo24': 'innertwo24 for 1',
              'innertwo25': 'innertwo25 for 1',
              'innertwo26': 'innertwo26 for 1',
              'innertwo27': 'innertwo27 for 1',
              'innertwo28': 'innertwo28 for 1',
              'innertwo29': 'innertwo29 for 1',
              'innertwo3': 1,
              'innertwo30': 'innertwo30 for 1',
              'innertwo31': 'innertwo31 for 1',
              'innertwo32': 'innertwo32 for 1',
              'innertwo33': 'innertwo33 for 1',
              'innertwo34': 'innertwo34 for 1',
              'innertwo35': 'innertwo35 for 1',
              'innertwo36': 'innertwo36 for 1',
              'innertwo37': 'innertwo37 for 1',
              'innertwo38': 'innertwo38 for 1',
              'innertwo39': 'innertwo39 for 1',
              'innertwo4': 10.0,
              'innertwo5': 10.0,
              'innertwo6': 10.0,
              'innertwo7': 10.0,
              'innertwo8': None,
              'innertwo9': 'innertwo9 for 1'}}

这是一个典型的输出,如下所示:

(On Ubuntu 20.04, Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz, Pandas 1.0.1)
data generation costs 2.431631326675415 seconds
json normalize costs 8.036803960800171 seconds
partial manual normalization costs 0.791933536529541 seconds
full manual normalization costs 2.0645904541015625 seconds

我也在公司的 Windows 计算机上运行了它,得到了非常相似的结果;那台机器使用的是 Pandas 1.1.5。

我检查了结果,这些DataFrame看起来是一致的。如果您能帮助我了解这些方法之间的性能差异,我将不胜感激。

编辑:添加环境细节

4

0 回答 0