1

I am doing a data migration and I am using the code below in several places. It was migrated more or less directly from some bash scripts, so it works, but I am wondering whether it would be more efficient to use the MySQL module instead. Thanks.

    # Python equivalent of the shell pipeline `echo "$query" | mysql ...`:
    # p1 writes the SQL text held in `query` to a pipe.
    p1 = Popen(["echo", query], stdout=PIPE, close_fds=True)

    # p2 is the mysql command-line client; it reads the SQL from p1's stdout
    # and its own stdout is captured through a pipe.
    p2 = Popen(["mysql", "--host=host", "--port=3999",
        "-u", "user", "--password=foo", "--protocol=TCP"],
        stdin=p1.stdout, stdout=PIPE, close_fds=True)

    # Close the parent's copy of the pipe so p1 can receive SIGPIPE if p2
    # exits before reading everything.
    p1.stdout.close()

`

# Python equivalent of the shell pipeline
# `echo "show columns from <table>" | mysql ... | awk '{print $1}'`.
# p1 emits the SHOW COLUMNS statement for `table`.
p1 = Popen(["echo", "show columns from %s" % (table)], stdout=PIPE,
        close_fds=True)

# p2 is the mysql command-line client, fed the statement from p1's stdout.
p2 = Popen(["mysql", "--host=host", "--port=3999",
    "-u", "user", "--password=foo", "--protocol=TCP"],
    stdin=p1.stdout, stdout=PIPE, close_fds=True)

# p3 keeps only the first whitespace-separated field of each output line
# (presumably the column name -- verify against the client's output format).
p3 = Popen(["awk", "{print $1}"], stdin=p2.stdout, stdout=PIPE,
        close_fds=True)
4

1 回答 1

3

这取决于你想要做什么。如果您不需要在 Python 中处理结果,那么实际上使用 subprocess 可能会更快。

在此处输入图像描述

在此图上,y 轴以秒为单位,x 轴表示所选数据的行数。

请注意,比较并不完全公平: using_subprocess返回一个字符串,而using_mysqldb返回一个元组列表。Python 创建这些 Python 对象所花费的额外时间肯定至少部分解释了速度上的差异。


如果您不需要在 Python 中处理结果,那么就没有令人信服的理由在 Python 中使用 subprocess。您不妨直接写一个 shell 脚本。

如果您需要使用 Python 来处理 mysql 返回的字符串,那么您可以使用 sh 模块代替 subprocess,使代码更具可读性。


import config
import subprocess
import shlex
import timeit
import MySQLdb
import collections
import matplotlib.pyplot as plt

# Short aliases so the pipeline code can refer to Popen/PIPE directly.
Popen, PIPE = subprocess.Popen, subprocess.PIPE
# Query template; ``{n}`` is filled in with the row limit of each run.
sql = 'select * from table limit {n}'

def using_subprocess(n):
    """Fetch ``n`` rows by piping the query into the ``mysql`` client.

    Mirrors the shell pipeline ``echo <query> | mysql ...`` using two child
    processes connected by a pipe.

    Args:
        n: Row limit substituted into the module-level ``sql`` template.

    Returns:
        Everything the ``mysql`` client wrote to stdout, as one string.
    """
    query = sql.format(n=n)
    # Build the argv lists directly instead of shlex.split()-ing a formatted
    # string, so config values containing spaces or quotes stay one argument.
    p1 = Popen(['echo', query], stdout=PIPE, close_fds=True)
    p2 = Popen(
        [
            'mysql',
            '--host={h}'.format(h=config.HOST),
            '-u', config.USER,
            '--password={p}'.format(p=config.PASS),
            '--database={d}'.format(d=config.MYDB),
        ],
        stdin=p1.stdout, stdout=PIPE, close_fds=True)

    # Close the parent's copy of the pipe so echo can receive SIGPIPE if
    # mysql exits before reading everything.
    p1.stdout.close()
    # stderr is not piped, so only the stdout half of the result is useful.
    out = p2.communicate()[0]
    # Reap the echo child; otherwise every call leaves a zombie process.
    p1.wait()
    return out

def using_mysqldb(n):
    """Fetch ``n`` rows through the MySQLdb driver.

    Opens a new connection using the settings in ``config``, runs the
    module-level ``sql`` template with the given limit, and closes the
    cursor and connection before returning.

    Args:
        n: Row limit substituted into the ``sql`` template.

    Returns:
        The rows exactly as returned by ``cursor.fetchall()``.
    """
    connection = MySQLdb.connect(
        host=config.HOST, user=config.USER,
        passwd=config.PASS, db=config.MYDB)
    # try/finally so the connection and cursor are released even if the
    # query raises; otherwise every call leaks a server connection.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(sql.format(n=n))
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

# Benchmark both implementations at row limits 1, 10, ..., 10**4 and plot the
# total seconds taken by 10 calls at each limit.
times = collections.defaultdict(list)
ns = [10**i for i in range(5)]
for n in ns:
    # For each limit, time the MySQLdb version first, then the subprocess one.
    for name in ('using_mysqldb', 'using_subprocess'):
        times[name].append(
            timeit.timeit('m.{f}({n})'.format(f=name, n=n),
                          'import __main__ as m',
                          number=10))

# .items() works on both Python 2 and 3; .iteritems() exists only on Python 2.
for name, timings in times.items():
    plt.plot(ns, timings, label=name)
plt.legend(loc='best')
plt.show()
于 2013-03-17T03:13:56.230 回答