python - 如何将xls转换为xlsx

Question

我有一些 *.xls (excel 2003) 文件，我想将这些文件转换为 xlsx (excel 2007)。

我使用uno python包，当我保存文档时，我可以设置过滤器名称：MS Excel 97 但是没有过滤器名称像'MS Excel 2007'，

如何设置过滤器名称以将 xls 转换为 xlsx ？

score 35 · Accepted Answer

你需要在你的机器上安装 win32com。这是我的代码：

import win32com.client as win32
# Convert a single .xls workbook to .xlsx by driving a locally installed Excel
# through COM. The try/finally blocks make sure the workbook is closed and the
# Excel process is shut down even when opening or saving fails.
fname = "full+path+to+xls_file"
excel = win32.gencache.EnsureDispatch('Excel.Application')
try:
    wb = excel.Workbooks.Open(fname)
    try:
        # FileFormat = 51 is for .xlsx extension, FileFormat = 56 is for .xls extension
        wb.SaveAs(fname + "x", FileFormat=51)
    finally:
        # The copy has already been written by SaveAs (or saving failed), so
        # close without saving again to avoid a blocking "save changes?" prompt.
        wb.Close(SaveChanges=False)
finally:
    excel.Application.Quit()

score 25 · Accepted Answer

这是我的解决方案，不考虑字体、图表和图像：

$ pip install pyexcel pyexcel-xls pyexcel-xlsx

然后这样做::

import pyexcel as p

# Transcode the .xls workbook into a new .xlsx file with pyexcel.
source_path = 'your-file-in.xls'
target_path = 'your-new-file-out.xlsx'
p.save_book_as(file_name=source_path, dest_file_name=target_path)

如果您不需要程序，您可以安装一个附加包 pyexcel-cli::

$ pip install pyexcel-cli
$ pyexcel transcode your-file-in.xls your-new-file-out.xlsx

上面的转码过程使用 xlrd 和 openpyxl。

score 23 · Accepted Answer

我以前不得不这样做。主要思想是使用xlrd模块打开和解析一个 xls 文件，并使用openpyxl模块将内容写入xlsx文件。

这是我的代码。注意！它不能处理复杂的 xls 文件；如果你要使用它，应当添加自己的解析逻辑。

import xlrd
from openpyxl.workbook import Workbook
from openpyxl.reader.excel import load_workbook, InvalidFileException

def open_xls_as_xlsx(filename):
    """Load the first non-empty sheet of an .xls file into a new openpyxl workbook.

    Only the raw cell values are copied, one cell at a time.

    @param filename: path of the .xls file, passed to xlrd.open_workbook
    @return: openpyxl Workbook whose active sheet holds the copied values; the
        active sheet is left empty when every sheet of the source is empty
    """
    # first open using xlrd
    book = xlrd.open_workbook(filename)

    # Pick the first sheet that has at least one row and one column. Bounding
    # the search by nsheets avoids an IndexError when all sheets are empty.
    sheet = None
    for index in range(book.nsheets):
        candidate = book.sheet_by_index(index)
        if candidate.nrows and candidate.ncols:
            sheet = candidate
            break

    # prepare a xlsx sheet
    book1 = Workbook()
    sheet1 = book1.active
    if sheet is None:
        return book1

    for row in range(sheet.nrows):
        for col in range(sheet.ncols):
            # xlrd indexes cells from 0, openpyxl from 1.
            sheet1.cell(row=row + 1, column=col + 1).value = sheet.cell_value(row, col)

    return book1

score 11 · Accepted Answer

我在这里没有找到 100% 正确的答案。所以我在这里发布我的代码：

import xlrd
from openpyxl.workbook import Workbook

def cvt_xls_to_xlsx(src_file_path, dst_file_path):
    """Copy the cell values of every sheet of an .xls file into a new .xlsx file.

    Sheet names are preserved; only cell values are copied.

    @param src_file_path: path of the .xls file, passed to xlrd.open_workbook
    @param dst_file_path: path the resulting openpyxl workbook is saved to
    """
    book_xls = xlrd.open_workbook(src_file_path)
    book_xlsx = Workbook()

    for sheet_index, sheet_name in enumerate(book_xls.sheet_names()):
        sheet_xls = book_xls.sheet_by_name(sheet_name)
        if sheet_index == 0:
            # A new Workbook already has one sheet; reuse it for the first
            # source sheet instead of leaving an extra empty sheet behind.
            sheet_xlsx = book_xlsx.active
            sheet_xlsx.title = sheet_name
        else:
            sheet_xlsx = book_xlsx.create_sheet(title=sheet_name)

        for row in range(sheet_xls.nrows):
            for col in range(sheet_xls.ncols):
                # xlrd indexes cells from 0, openpyxl from 1.
                sheet_xlsx.cell(row=row + 1, column=col + 1).value = sheet_xls.cell_value(row, col)

    book_xlsx.save(dst_file_path)

score 7 · Accepted Answer

Ray 的回答对我帮助很大，但对于那些寻找一种简单方法将所有工作表从 xls 转换为 xlsx 的人，我提出了以下要点：

import xlrd
from openpyxl.workbook import Workbook as openpyxlWorkbook

# content is a string containing the file. For example the result of an http.request(url).
# You can also use a filepath by calling "xlrd.open_workbook(filepath)".

# Parse the in-memory .xls content and copy every sheet, cell by cell, into a
# new openpyxl workbook.
xlsBook = xlrd.open_workbook(file_contents=content)
workbook = openpyxlWorkbook()

for i in range(xlsBook.nsheets):
    xlsSheet = xlsBook.sheet_by_index(i)
    # Reuse the sheet a new workbook starts with for the first source sheet.
    sheet = workbook.active if i == 0 else workbook.create_sheet()
    sheet.title = xlsSheet.name

    for row in range(xlsSheet.nrows):
        for col in range(xlsSheet.ncols):
            # xlrd indexes cells from 0, openpyxl from 1.
            sheet.cell(row=row + 1, column=col + 1).value = xlsSheet.cell_value(row, col)

# The new xlsx file is in "workbook", without iterators (iter_rows).
# For iteration, use "for row in worksheet.rows:".
# For range iteration, use "for row in worksheet.range("{}:{}".format(startCell, endCell)):".

您可以在此处找到 xlrd 库和此处的 openpyxl （例如，您必须在您的项目中为 Google App Engine 下载 xlrd）。

score 6 · Accepted Answer

我正在提高@Jackypengyu 方法的性能。

XLSX：每行工作，而不是每个单元格（http://openpyxl.readthedocs.io/en/default/api/openpyxl.worksheet.worksheet.html#openpyxl.worksheet.worksheet.Worksheet.append）
XLS：读取整行，不包括空尾，请参阅ragged_rows=True（http://xlrd.readthedocs.io/en/latest/api.html#xlrd.sheet.Sheet.row_slice）

合并的单元格也将被转换。

结果

以相同的顺序转换相同的 12 个文件：

原文：

0:00:01.958159
0:00:02.115891
0:00:02.018643
0:00:02.057803
0:00:01.267079
0:00:01.308073
0:00:01.245989
0:00:01.289295
0:00:01.273805
0:00:01.276003
0:00:01.293834
0:00:01.261401

改进：

0:00:00.774101
0:00:00.734749
0:00:00.741434
0:00:00.744491
0:00:00.320796
0:00:00.279045
0:00:00.315829
0:00:00.280769
0:00:00.316380
0:00:00.289196
0:00:00.347819
0:00:00.284242

解决方案

def cvt_xls_to_xlsx(*args, **kw):
    """Open and convert XLS file to openpyxl.workbook.Workbook object

    For every sheet this copies the cell values row by row, the merged cell
    ranges, and the hidden flag of rows and columns. Date cells are converted
    to datetime.datetime, or to datetime.time when the cell holds a time
    without a date part.

    @param args: args for xlrd.open_workbook
    @param kw: kwargs for xlrd.open_workbook
    @return: openpyxl.workbook.Workbook
    """
    # Imported here so the function does not rely on the caller's imports.
    import datetime

    from openpyxl.utils.cell import get_column_letter

    book_xls = xlrd.open_workbook(*args, formatting_info=True, ragged_rows=True, **kw)
    book_xlsx = Workbook()
    # Use the workbook's own date system instead of assuming the 1900 one.
    datemode = book_xls.datemode

    def _get_xlrd_cell_value(cell):
        """Return the cell value, with date cells turned into datetime objects."""
        value = cell.value
        if cell.ctype == xlrd.XL_CELL_DATE:
            datetime_tup = xlrd.xldate_as_tuple(value, datemode)
            if datetime_tup[0:3] == (0, 0, 0):
                # Time without a date: datetime.datetime(0, 0, 0, ...) would
                # raise "year is out of range", so build a time object.
                value = datetime.time(*datetime_tup[3:])
            else:
                value = datetime.datetime(*datetime_tup)
        return value

    for sheet_index, sheet_name in enumerate(book_xls.sheet_names()):
        sheet_xls = book_xls.sheet_by_name(sheet_name)

        if sheet_index == 0:
            # Reuse the sheet a new Workbook starts with.
            sheet_xlsx = book_xlsx.active
            sheet_xlsx.title = sheet_name
        else:
            sheet_xlsx = book_xlsx.create_sheet(title=sheet_name)

        for rlo, rhi, clo, chi in sheet_xls.merged_cells:
            # Lower bounds get +1 for openpyxl's 1-based indexing; the upper
            # bounds are passed through unchanged.
            sheet_xlsx.merge_cells(
                start_row=rlo + 1, end_row=rhi,
                start_column=clo + 1, end_column=chi,
            )

        for row in range(sheet_xls.nrows):
            # Rows are ragged, so slice each one to its own length.
            sheet_xlsx.append([
                _get_xlrd_cell_value(cell)
                for cell in sheet_xls.row_slice(row, end_colx=sheet_xls.row_len(row))
            ])

        # The info maps are dicts that may lack entries for some indexes, so
        # look them up with .get() instead of indexing directly.
        for rowx in range(sheet_xls.nrows):
            rowinfo = sheet_xls.rowinfo_map.get(rowx)
            if rowinfo is not None and rowinfo.hidden != 0:
                print("%s %s" % (sheet_name, rowx))
                sheet_xlsx.row_dimensions[rowx + 1].hidden = True
        for coly in range(sheet_xls.ncols):
            colinfo = sheet_xls.colinfo_map.get(coly)
            if colinfo is not None and colinfo.hidden != 0:
                print("%s %s" % (sheet_name, coly))
                coly_letter = get_column_letter(coly + 1)
                sheet_xlsx.column_dimensions[coly_letter].hidden = True

    return book_xlsx

score 5 · Accepted Answer

您可以使用 pandas IO 功能：

import pandas as pd

# Read the legacy workbook with no header row, then write the values back out
# as .xlsx without adding an index column or a header row.
source_path = 'file_2003.xls'
target_path = 'file_2003.xlsx'
df = pd.read_excel(source_path, header=None)
df.to_excel(target_path, index=False, header=False)

score 2 · Accepted Answer

简单的解决方案

我需要一个简单的解决方案来将几个转换xls为xlsx格式。这里有很多答案，但他们正在做一些我不完全理解的“魔术”。

chfw给出了一个简单的解决方案，但并不完全。

安装依赖项

使用 pip 安装

pip install pyexcel-cli pyexcel-xls pyexcel-xlsx

执行

所有样式和宏都将消失，但信息完好无损。

对于单个文件

pyexcel transcode your-file-in.xls your-new-file-out.xlsx

对于文件夹中的所有文件，使用一行命令（one-liner）

# Transcode every .xls file in the current directory to <name>.xlsx (no ";" after "do": bash rejects it).
for file in *.xls; do echo "Transcoding $file"; pyexcel transcode "$file" "${file}x"; done

score 2 · Accepted Answer

我尝试了@Jhon Anderson 的解决方案，效果很好，但是当存在没有日期的 HH:mm:ss 等时间格式的单元格时出现“年份超出范围”错误。在那里我再次改进了算法：

def xls_to_xlsx(*args, **kw):
    """
    Open and convert an XLS file to openpyxl.workbook.Workbook

    Copies the merged cell ranges and the cell values of every sheet. Date
    cells become datetime.datetime objects, or datetime.time objects when the
    cell holds a time without a date part.
    ----------
    @param args: args for xlrd.open_workbook
    @param kw: kwargs for xlrd.open_workbook
    @return: openpyxl.workbook.Workbook object
    """
    # Imported here so the function does not rely on the caller's imports.
    import datetime

    import openpyxl.workbook

    book_xls = xlrd.open_workbook(*args, formatting_info=True, ragged_rows=True, **kw)
    book_xlsx = openpyxl.workbook.Workbook()
    # Use the workbook's own date system instead of assuming the 1900 one.
    datemode = book_xls.datemode

    def _get_xlrd_cell_value(cell):
        """Return the cell value, with date cells turned into datetime objects."""
        value = cell.value
        if cell.ctype == xlrd.XL_CELL_DATE:
            datetime_tup = xlrd.xldate_as_tuple(value, datemode)
            if datetime_tup[0:3] == (0, 0, 0):   # time format without date
                value = datetime.time(*datetime_tup[3:])
            else:
                value = datetime.datetime(*datetime_tup)
        return value

    for sheet_index, sheet_name in enumerate(book_xls.sheet_names()):
        sheet_xls = book_xls.sheet_by_name(sheet_name)
        if sheet_index == 0:
            # Reuse the sheet a new Workbook starts with.
            sheet_xlsx = book_xlsx.active
            sheet_xlsx.title = sheet_name
        else:
            sheet_xlsx = book_xlsx.create_sheet(title=sheet_name)

        for rlo, rhi, clo, chi in sheet_xls.merged_cells:
            # Lower bounds get +1 for openpyxl's 1-based indexing; the upper
            # bounds are passed through unchanged.
            sheet_xlsx.merge_cells(
                start_row=rlo + 1, end_row=rhi,
                start_column=clo + 1, end_column=chi,
            )

        for row in range(sheet_xls.nrows):
            # Rows are ragged, so slice each one to its own length.
            sheet_xlsx.append([
                _get_xlrd_cell_value(cell)
                for cell in sheet_xls.row_slice(row, end_colx=sheet_xls.row_len(row))
            ])
    return book_xlsx

然后完美工作！

score 1 · Accepted Answer

将 XLS 文件转换为 XLSX

我在使用 Python 3.6 时刚好遇到了同样的问题，经过数小时的努力，我通过以下步骤解决了它（你可能不需要所有这些包；我会尽可能讲清楚）：

确保在继续之前安装以下软件包

pip install pyexcel，pip install pyexcel-xls，pip install pyexcel-xlsx，

pip install pyexcel-cli

步骤1：

import pyexcel

第 2 步：“example.xls”、“example.xlsx”、“example.xlsm”

sheet0 = pyexcel.get_sheet(file_name="your_file_path.xls", name_columns_by_row=0)

step3：从内容创建数组

# The sheet loaded in step 2 is bound to "sheet0"; convert it to a list of rows.
xlsarray = sheet0.to_array()

step4：检查变量内容以验证

xlsarray

第 5 步：将保存在名为 (xlsarray) 的变量中的数组传递给名为 (sheet1) 的新工作簿变量

sheet1 = pyexcel.Sheet(xlsarray)

第6步：保存以.xlsx结尾的新工作表（在我的情况下我想要xlsx）

sheet1.save_as("test.xlsx")

score 1 · Accepted Answer

好吧，我保持简单并尝试使用 Pandas：

import pandas as pd

# Load the .xls file into a DataFrame and write it back out as .xlsx without
# the index column.
input_path = r'Path_of_your_file\\name_of_your_file.xls'
output_path = r'Output_path\\new_file_name.xlsx'

df = pd.read_excel(input_path)
df.to_excel(output_path, index=False)

score 0 · Accepted Answer

@CaKel 和 @Jhon Anderson 解决方案：

def _get_xlrd_cell_value(cell):
    """Return the value of an xlrd cell, converting date cells to datetime objects.

    Date cells become datetime.datetime, or datetime.time when the converted
    tuple has no date part. All other cell types are returned unchanged.

    @param cell: xlrd cell object (its .ctype and .value are read)
    @return: the raw value, a datetime.datetime or a datetime.time
    """
    value = cell.value
    if cell.ctype == xlrd.XL_CELL_DATE:
        if value == 1.0:
            # Workaround kept from the original answer: a stored value of
            # exactly 1.0 is treated as the time 00:00 instead of being
            # passed to xldate_as_tuple.
            # NOTE(review): presumably xldate_as_tuple rejects this value in
            # the 1900 date system - confirm against the xlrd documentation.
            datetime_tup = (0, 0, 0, 0, 0, 0)
        else:
            datetime_tup = xlrd.xldate_as_tuple(value, 0)

        if datetime_tup[0:3] == (0, 0, 0):
            # No date part: build a time, since datetime.datetime(0, 0, 0, ...)
            # is not a valid date.
            value = datetime.time(*datetime_tup[3:])
        else:
            value = datetime.datetime(*datetime_tup)
    return value

现在这段代码对我来说运行完美！

score 0 · Accepted Answer

我首先尝试了 @Jhon 的解决方案，后来改用 pyexcel 作为解决方案

# Convert the file at oldfilename into newfilename with pyexcel.
# NOTE(review): presumably the output format is chosen from newfilename's
# extension - confirm against the pyexcel documentation.
pyexcel.save_as(file_name=oldfilename, dest_file_name=newfilename)

在我尝试通过 PyInstaller 将我的项目打包到单个 exe 文件之前，它可以正常工作，我尝试了所有隐藏的导入选项，但仍然存在以下错误：

  File "utils.py", line 27, in __enter__
    pyexcel.save_as(file_name=self.filename, dest_file_name=newfilename)
  File "site-packages\pyexcel\core.py", line 77, in save_as
  File "site-packages\pyexcel\internal\core.py", line 22, in get_sheet_stream
  File "site-packages\pyexcel\plugins\sources\file_input.py", line 39, in get_da
ta
  File "site-packages\pyexcel\plugins\parsers\excel.py", line 19, in parse_file
  File "site-packages\pyexcel\plugins\parsers\excel.py", line 40, in _parse_any
  File "site-packages\pyexcel_io\io.py", line 73, in get_data
  File "site-packages\pyexcel_io\io.py", line 91, in _get_data
  File "site-packages\pyexcel_io\io.py", line 188, in load_data
  File "site-packages\pyexcel_io\plugins.py", line 90, in get_a_plugin
  File "site-packages\lml\plugin.py", line 290, in load_me_now
  File "site-packages\pyexcel_io\plugins.py", line 107, in raise_exception
pyexcel_io.exceptions.SupportingPluginAvailableButNotInstalled: Please install p
yexcel-xls
[3192] Failed to execute script

然后，我跳到pandas：

# Load the old workbook into a DataFrame, then write it to the new file under
# the configured sheet name, without the index column.
frame = pd.read_excel(oldfilename)
frame.to_excel(newfilename, sheet_name=self.sheetname, index=False)

2020 年 2 月 21 日更新

openpyxl提供了函数：append

启用向 xlsx 文件插入行的能力，这意味着用户可以从 xls 文件中读取数据并将它们插入到 xlsx 文件中。

append(['这是 A1', '这是 B1', '这是 C1'])
或 append({'A' : '这是 A1', 'C' : '这是 C1'})
或 append({1 : '这是 A1', 3 : '这是 C1'})

在当前工作表的底部附加一组值：

如果是列表：所有值按顺序添加，从第一列开始
如果它是一个字典：值被分配给由键（数字或字母）指示的列

score 0 · Accepted Answer

使用win32com（pywin32）作为@kvdogan的答案主要是完美的方法。一些先决条件：

使用 Windows（显然）；
已安装 MSExcel；
确保文件的路径是完整路径；

此外，SourceForge 上的 Pywin32 项目不是最新的，请改用 GitHub：https://github.com/mhammond/pywin32 。安装后的项目文件夹中附带一份 .chm 文档，可以用 SumatraPDF 等工具阅读。

#My answer contains no code at all.

编辑：我没有足够的声誉发表评论。我为实际的洪水感到抱歉。

score 0 · Accepted Answer

这是适用于具有旧 xls 文件（例如 Excel 97 2004）的 MacOS 的解决方案。

如果不能选择 excel，我发现处理这种格式的最佳方法是在 openoffice 中打开文件并将其另存为 csv 文件格式。

score 0 · Accepted Answer

Ray 的回答是裁剪数据的第一行和最后一列。这是我修改后的解决方案（适用于 python3）：

def open_xls_as_xlsx(filename):
    """Load the first non-empty sheet of an .xls file into a new openpyxl workbook.

    Only the raw cell values are copied, one cell at a time.

    @param filename: path of the .xls file, passed to xlrd.open_workbook
    @return: openpyxl Workbook whose active sheet holds the copied values; the
        active sheet is left empty when every sheet of the source is empty
    """
    # first open using xlrd
    book = xlrd.open_workbook(filename)

    # Test emptiness on the raw row/column counts. Adding 1 before the test,
    # as the previous version did, makes the product never zero, so empty
    # leading sheets were no longer skipped.
    sheet = None
    for index in range(book.nsheets):
        candidate = book.sheet_by_index(index)
        if candidate.nrows and candidate.ncols:
            sheet = candidate
            break

    # prepare a xlsx sheet
    book1 = Workbook()
    sheet1 = book1.active
    if sheet is None:
        return book1

    # openpyxl coordinates start at 1, xlrd coordinates at 0.
    for row in range(1, sheet.nrows + 1):
        for col in range(1, sheet.ncols + 1):
            sheet1.cell(row=row, column=col).value = sheet.cell_value(row - 1, col - 1)

    return book1

score 0 · Accepted Answer

尝试使用 win32com 应用程序。将其安装在您的机器上。

import sys, os
import win32com.client
# Convert every .xls workbook in `directory` to .xlsx, next to the original,
# by driving a locally installed Excel through COM.
directory = 'C:\\Users\\folder\\'

# Start Excel once for the whole batch instead of once per file.
App = win32com.client.Dispatch("Excel.Application")
App.Visible = True
try:
    for file in os.listdir(directory):
        # splitext splits on the last dot, so names such as "report.v2.xls"
        # keep their full stem.
        stem, end = os.path.splitext(file)
        if end.lower() != '.xls':
            continue  # skip anything that is not a legacy workbook

        # os.listdir returns bare names; build full paths for both the input
        # and the output so neither depends on Excel's current directory.
        InFile = os.path.join(directory, file)
        OutFile = os.path.join(directory, stem + ".xlsx")

        workbook = App.Workbooks.Open(InFile)
        try:
            workbook.SaveAs(OutFile, 51)   #51 is for xlsx
        finally:
            # The copy has already been written by SaveAs (or saving failed),
            # so close without saving again.
            workbook.Close(SaveChanges=False)
finally:
    # Shut Excel down even when a file fails to open or save.
    App.Quit()

谢谢你。