From 363a177559690688f12f7f3a3d68ced2b2628032 Mon Sep 17 00:00:00 2001 From: "shimoda.m@nds-tyo.co.jp" Date: Tue, 10 Jun 2025 16:13:37 +0900 Subject: [PATCH] =?UTF-8?q?feat:=20gz=E8=A7=A3=E5=87=8D=E3=81=AB=E5=AF=BE?= =?UTF-8?q?=E5=BF=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ecs/dataimport/dataimport/chk.py | 27 ++++++++------- ecs/dataimport/dataimport/common.py | 53 +++++++++++++++++++++++++++-- ecs/dataimport/dataimport/main.py | 6 ++-- 3 files changed, 69 insertions(+), 17 deletions(-) diff --git a/ecs/dataimport/dataimport/chk.py b/ecs/dataimport/dataimport/chk.py index 7b8b3ba6..de3feca6 100644 --- a/ecs/dataimport/dataimport/chk.py +++ b/ecs/dataimport/dataimport/chk.py @@ -6,7 +6,8 @@ from datetime import datetime import boto3 from common import (ERROR, INFO, LINE_FEED_CODE, SETTINGS_ITEM, - convert_quotechar, debug_log, uncompress_zip) + convert_quotechar, debug_log, uncompress_gzip, + uncompress_zip) from end import end from error import error @@ -181,7 +182,7 @@ def is_empty_file(work_csv_row: list, settings_list: list): return len(work_csv_row) == 0 -def uncompress_file(work_data_file: bytes, settings_list: list, log_info) -> bytes: +def uncompress_file(work_data_file: io.BytesIO, settings_list: list, log_info) -> bytes: """指定された形式で圧縮ファイルを展開し、ローカルに書き出す。 Args: work_data_file (bytes): S3から読み込んだ登録 @@ -198,7 +199,9 @@ def uncompress_file(work_data_file: bytes, settings_list: list, log_info) -> byt file_bytes: bytes = None if compression == 'zip': # zipファイルを展開し、ファイルを書き出す。 - file_bytes = uncompress_zip() + file_bytes = uncompress_zip(work_data_file, settings_list, log_info) + elif compression == 'gzip': + file_bytes = uncompress_gzip(work_data_file, settings_list, log_info) # MEMO: zip以外の圧縮形式に対応する際に追記すること else: @@ -259,12 +262,12 @@ def reverse_readline_stream(f: io.BytesIO, line_feed: str, chunk_size=4096): # ローカル実行用コード # 値はよしなに変えてください -# if __name__ == '__main__': -# check( -# bucket_name='バケット名', -# target_data_source='データソース名', -# target_file_name='targetフォルダ内のファイル名', -# settings_key='個別設定ファイル名', -# log_info='Info', -# mode='i' -# ) +if __name__ == '__main__': + check( + bucket_name='test-shimoda-bucket', + target_data_source='test', + target_file_name='compress_test.gz', + settings_key='test/settings/compress_test_gzip.txt', + log_info='Info', + mode='i' + ) diff --git a/ecs/dataimport/dataimport/common.py b/ecs/dataimport/dataimport/common.py index 22adf07d..0afea617 100644 --- a/ecs/dataimport/dataimport/common.py +++ b/ecs/dataimport/dataimport/common.py @@ -1,4 +1,5 @@ import csv +import gzip import io import zipfile from datetime import datetime @@ -92,11 +93,11 @@ def convert_quotechar(quotechar): return quotechar -def uncompress_zip(work_data_file: bytes, settings_list: list, log_info) -> bytes: +def uncompress_zip(work_data_file: io.BytesIO, settings_list: list, log_info) -> bytes: """zip圧縮されているファイルを展開する Args: - work_data_file (bytes): 展開対象のバイナリ + work_data_file (io.BytesIO): 展開対象のバイナリ(.zip) settings_list (list): 個別設定ファイルの情報 log_info (_type_): ログ情報 @@ -144,3 +145,51 @@ def uncompress_zip(work_data_file: bytes, settings_list: list, log_info) -> byte delimiter=delimiter) for row in csv_reader: csv_writer.writerow(row) + + +def uncompress_gzip(work_data_file: io.BytesIO, settings_list: list, log_info) -> bytes: + """gzip 圧縮されたファイルを展開する + + Args: + work_data_file (io.BytesIO): 展開対象のバイナリ (.gz) + settings_list (list): 個別設定ファイルの情報 + log_info (_type_): ログ情報 + + Raises: + Exception: 展開に失敗した場合 + + Returns: + bytes: 展開後ファイルのバイナリ + """ + + gz_bytes = work_data_file.getvalue() + try: + # GZIP をバイト列ごと展開する + file_bytes = gzip.decompress(gz_bytes) + except (OSError, EOFError) as e: + raise Exception(f"GZIP 解凍に失敗しました: {e}") + + # ファイルを一時ディレクトリに書き出す。 + encoding = settings_list[SETTINGS_ITEM["charCode"]] + line_feed = LINE_FEED_CODE[settings_list[SETTINGS_ITEM["lineFeedCode"]]] + delimiter = settings_list[SETTINGS_ITEM["delimiter"]] + quote_char = convert_quotechar(settings_list[SETTINGS_ITEM["quotechar"]]) + + uncompressed_file = io.TextIOWrapper( + io.BytesIO(file_bytes), + encoding=encoding, + newline=line_feed + ) + csv_reader = csv.reader( + uncompressed_file, + quotechar=convert_quotechar(settings_list[SETTINGS_ITEM["quotechar"]]), + delimiter=delimiter + ) + + with open(LOCAL_TEMPORARY_FILE_PATH, 'w', encoding=encoding, newline='') as csvfile: + csv_writer = csv.writer( + csvfile, quotechar=quote_char, delimiter=delimiter) + for row in csv_reader: + csv_writer.writerow(row) + + return file_bytes diff --git a/ecs/dataimport/dataimport/main.py b/ecs/dataimport/dataimport/main.py index 13f9c0c9..13cdbfa1 100644 --- a/ecs/dataimport/dataimport/main.py +++ b/ecs/dataimport/dataimport/main.py @@ -4,9 +4,9 @@ from datetime import datetime import boto3 import pymysql -from chk import LOCAL_TEMPORARY_FILE_PATH -from common import (ERROR, INFO, LINE_FEED_CODE, MYSQL_CHARSET_CODE, - SETTINGS_ITEM, WARNING, convert_quotechar, debug_log) +from common import (ERROR, INFO, LINE_FEED_CODE, LOCAL_TEMPORARY_FILE_PATH, + MYSQL_CHARSET_CODE, SETTINGS_ITEM, WARNING, + convert_quotechar, debug_log) from error import error from pymysql.constants import CLIENT