原初衷是为了将excel数据入库,在这之前需要将数据提取出来。
使用openpyxl包,提取xslx格式的excel文件,再读取行列单元格的值,存为列表包字典的数据列表,最后存为json格式。
import json
import openpyxl
import logging
import threading
from concurrent.futures import ThreadPoolExecutor
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
def process_row(row, headers, row_num):
item = {}
for i, value in enumerate(row):
item[headers[i]] = value
logging.info('Thread: %s, Row: %s', threading.current_thread().name, row_num)
return item
def row_to_dict(row, headers, row_num):
return process_row(row, headers, row_num)
def excel_to_json(file_path):
# 打开 Excel 文件
wb = openpyxl.load_workbook(file_path)
# 获取第一个工作表
ws = wb.active
# 获取表头
headers = [cell.value for cell in ws[1]]
# 构建 JSON 数据
data = []
with ThreadPoolExecutor(max_workers=10) as executor:
rows = ws.iter_rows(min_row=2, values_only=True)
for i, item in enumerate(executor.map(row_to_dict, rows, [headers]*ws.max_row, range(2, ws.max_row+1))):
data.append(item)
# 将 JSON 数据写入文件
with open('result.json', 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
file_name = 'source/补齐企业工商信息0522.xlsx'
excel_to_json(file_name)