# Python繁转简 (Python: convert Traditional Chinese text files to Simplified Chinese)

import os
import codecs
import opencc
import chardet
import shutil
from tqdm import tqdm


# Chinese text conversion
class ChineseConvert(object):
    """Convert the text files found directly inside one directory with OpenCC.

    ``main`` drives the whole run: it lists the regular files of
    ``input_path``, guesses each file's encoding with chardet, copies files
    whose leading text is left unchanged by the converter, and rewrites the
    remaining files as UTF-8 into ``<input_path>/output``. Files that cannot
    be decoded are collected in ``error_file_list`` instead of aborting the
    run.
    """

    # Bytes sampled for encoding detection, and characters sampled for the
    # "does conversion change anything" check.
    _SAMPLE_SIZE = 1000

    def __init__(self, text_format, input_path):
        """Set up the converter and the bookkeeping lists.

        Args:
            text_format: OpenCC configuration name handed to
                ``opencc.OpenCC`` (the script entry point uses "t2s.json").
            input_path: Directory whose files are processed; results are
                written to its ``output`` sub-directory.
        """
        # OpenCC converter
        self.converter = opencc.OpenCC(text_format)
        # Input directory
        self.input_path = input_path
        # Output directory
        self.output_path = os.path.join(input_path, "output")
        # All input files, and the subset that still needs conversion
        self.file_list = []
        self.new_file_list = []
        # Encodings, index-parallel to the two lists above
        self.file_encoding_list = []
        self.new_file_encoding_list = []
        # Files that could not be decoded
        self.error_file_list = []

    @staticmethod
    def _normalize_encoding(encoding):
        """Map a detected encoding name to the codec name used for reading.

        Returns ``None`` unchanged so callers can apply their own default.
        """
        if encoding is None:
            return None
        name = str(encoding).lower()
        # Any GB-family result is read as GB18030.
        if "gb" in name:
            return "gb18030"
        # Detection only sees the first _SAMPLE_SIZE bytes, so an "ascii"
        # verdict says nothing about the rest of the file. UTF-8 decodes
        # ASCII identically and also accepts later non-ASCII text.
        if name == "ascii":
            return "utf-8"
        return name

    def _read_text(self, file_path, encoding, size=-1):
        """Read up to ``size`` characters of ``file_path`` (all if negative).

        Returns the text, or ``None`` after recording the file in
        ``error_file_list`` when it cannot be decoded.
        """
        if encoding is None:
            encoding = "utf-8"
        try:
            with open(file_path, "r", encoding=encoding) as input_file:
                return input_file.read(size)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected encoding name has no Python codec.
            # Treat it like undecodable content rather than crashing the run.
            self.error_file_list.append(file_path)
            return None

    def _output_path_for(self, file_path):
        """Return the output path for ``file_path`` (file name is converted too)."""
        file_name = os.path.basename(file_path)
        return os.path.join(self.output_path, self.converter.convert(file_name))

    # Convert one file
    def convert_files(self, index):
        """Convert ``new_file_list[index]`` and write the result as UTF-8.

        Args:
            index: Position in ``new_file_list`` / ``new_file_encoding_list``.
        """
        source_path = self.new_file_list[index]
        content = self._read_text(source_path, self.new_file_encoding_list[index])
        if content is None:
            return
        converted = self.converter.convert(content)
        # newline="" writes "\n" untranslated on every platform.
        with open(self._output_path_for(source_path), "w",
                  encoding="utf-8", newline="") as output_file:
            output_file.write(converted)

    # Collect the input files
    def obtain_file_list(self):
        """Append every regular file directly inside ``input_path`` to ``file_list``.

        Sub-directories (including the output directory) are skipped. Entries
        are visited in sorted order so runs are deterministic.
        """
        for entry in sorted(os.listdir(self.input_path)):
            file_path = os.path.join(self.input_path, entry)
            if os.path.isfile(file_path):
                self.file_list.append(file_path)

    # Detect a file's encoding
    def detect_file_encoding(self, file_path):
        """Guess the encoding of ``file_path`` and append it to ``file_encoding_list``.

        ``None`` is appended when chardet reports no encoding.
        """
        with open(file_path, "rb") as file:
            data = file.read(self._SAMPLE_SIZE)
        encoding_info = chardet.detect(data)
        self.file_encoding_list.append(
            self._normalize_encoding(encoding_info["encoding"])
        )

    # Decide whether a file needs conversion
    def detect_simplified_chinese(self, index):
        """Queue ``file_list[index]`` for conversion or copy it to the output.

        The first ``_SAMPLE_SIZE`` characters are run through the converter.
        If the text changes, the file and its encoding are appended to
        ``new_file_list`` / ``new_file_encoding_list``. Otherwise the file is
        copied as-is into the output directory under its converted name.

        Args:
            index: Position in ``file_list`` / ``file_encoding_list``.
        """
        source_path = self.file_list[index]
        encoding = self.file_encoding_list[index]
        sample = self._read_text(source_path, encoding, self._SAMPLE_SIZE)
        if sample is None:
            return
        if self.converter.convert(sample) != sample:
            self.new_file_list.append(source_path)
            self.new_file_encoding_list.append(encoding)
        else:
            shutil.copy(source_path, self._output_path_for(source_path))

    def main(self):
        """Run the full pipeline: list, detect encodings, classify, convert."""
        # Start from a clean slate so a repeated call does not process every
        # file twice.
        self.file_list = []
        self.new_file_list = []
        self.file_encoding_list = []
        self.new_file_encoding_list = []
        self.error_file_list = []
        # Collect the input files
        self.obtain_file_list()
        # Recreate the output directory from scratch
        if os.path.exists(self.output_path):
            shutil.rmtree(self.output_path)
        os.makedirs(self.output_path, exist_ok=True)
        # Detect file encodings
        for file_path in tqdm(self.file_list, desc="格式检测中"):
            self.detect_file_encoding(file_path)
        # Classify files: already converted vs. needs conversion
        print("检测前文件数量: {}".format(len(self.file_list)))
        for index in tqdm(range(len(self.file_encoding_list)), desc="繁简检测中"):
            self.detect_simplified_chinese(index)
        print("检测后文件数量: {}".format(len(self.new_file_list)))
        # Convert the files that need it
        for index in tqdm(range(len(self.new_file_list)), desc="格式转换中"):
            self.convert_files(index)
        print("准换完成!")
        print("错误文件列表: {}".format(self.error_file_list))


if __name__ == "__main__":
    # Script entry point: build a converter with the "t2s.json" OpenCC
    # configuration and run the whole pipeline on the input directory.
    # NOTE(review): the path literal looks like a placeholder to be replaced
    # with the real input directory — confirm before running.
    ChineseConvert("t2s.json", "繁体文件路径").main()
# 本作品采用知识共享署名-非商业性使用 4.0 国际许可协议进行许可
# Python繁转简
#
# 文章评论