スキャンしたPDFの処理を考えるその4 まとめとおまけ（画像圧縮・zipでまとめる）

https://s-densan.hatenablog.com/entry/2021/02/11/ocr3 の続きです。

処理のステップ

以下のステップをつなげて、1つのツールとして完成させます。

PDFを画像に変換する
→popplerに含まれるpdfimagesを使いました。
画像にOCR処理をかける
→OCRを行うGoogleのツールtesseractを使いました。
検索の仕組みを考える
→画像にExif情報を埋め込むツールexiftoolを使いました。

プログラム

今までのプログラムを含めて、つながるようにちょいちょいと改修して以下のようにしました。

# ocr4.py
import json
import tempfile
import os
from typing import List, Dict
import glob
import subprocess

def main(src_dir: str = r'C:\ocr_src', dst_dir: str = r'C:\ocr_dst'):
    """
    Convert every PDF in a folder to page images and OCR each image.

    For each image the recognized text is written to a ``.txt`` file with
    the same base name and is also handed to ``set_exif`` for that image.
    When all PDFs are done, the paths of the written text files are printed.

    Parameters
    ----------
    src_dir : str
        Folder searched (non-recursively) for ``*.pdf`` files.
    dst_dir : str
        Folder under which one sub-folder per PDF is created.
    """
    # Sort for a reproducible processing order; escape the folder so that
    # characters such as "[" in its name are not read as glob patterns.
    pdf_path_list = sorted(glob.glob(os.path.join(glob.escape(src_dir), '*.pdf')))

    text_path_list = []
    for pdf_path in pdf_path_list:
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        dst_dir_onefile = os.path.join(dst_dir, pdf_name)
        # Skip PDFs whose output folder already exists
        # (presumably handled by an earlier run).
        if os.path.exists(dst_dir_onefile):
            continue
        # Guard against a failed conversion that yields no list at all.
        image_path_list = pdf2image(pdf_path, dst_dir_onefile) or []
        for image_path in image_path_list:
            text = ocr_text(image_path)
            if not isinstance(text, str):
                # Anything other than a string means OCR failed; store empty
                # text instead of crashing in write().
                text = ''
            text_path = os.path.splitext(image_path)[0] + '.txt'
            # Explicit UTF-8: the locale default may be unable to encode
            # every character the OCR produced.
            with open(text_path, 'w', encoding='utf-8') as fp:
                fp.write(text)
            text_path_list.append(text_path)
            set_exif(image_path, text)
    print('完了')
    print('\n'.join(text_path_list))

def pdf2image(src_pdf_path: str, dst_path: str) -> List[str]:
    """
    Extract the images of a PDF file into a folder as PNG files.

    Runs ``pdfimages -png`` with ``dst_path`` as working directory and the
    output prefix ``out``, then collects the ``out-*.png`` files found there.

    Parameters
    ----------
    src_pdf_path : str
        Input PDF file path.
    dst_path : str
        Output folder path; created when it does not exist.

    Returns
    -------
    List[str]
        Sorted paths of the PNG files found in ``dst_path``. Empty when the
        PDF does not exist, when ``dst_path`` is an existing regular file,
        or when the command could not be started.
    """
    # Program name (use a full path if it is not on PATH).
    program = 'pdfimages'

    # Nothing to do without an input file.
    if not os.path.isfile(src_pdf_path):
        return []

    # Create the output folder; a regular file in its place is an error.
    if not os.path.isdir(dst_path):
        if os.path.isfile(dst_path):
            return []
        os.makedirs(dst_path)

    # "-png" so the produced files match the "out-*.png" pattern below.
    # The PDF path is made absolute because the command runs inside dst_path.
    command = [program, '-png', os.path.abspath(src_pdf_path), 'out']
    command_text = ' '.join(command)
    try:
        # cwd= replaces os.chdir(): the caller's working directory is never
        # touched, so there is nothing to restore if something goes wrong.
        completed = subprocess.run(command, cwd=dst_path)
    except OSError as ex:
        print(f'コマンド実行中にエラーが発生しました。command = {command_text}, Exception = {ex}')
        return []
    if completed.returncode != 0:
        # Report the failure but still return whatever images were written.
        print(f'コマンド実行中にエラーが発生しました。command = {command_text}, returncode = {completed.returncode}')

    # Sorted so pages come back in a stable order.
    return sorted(glob.glob(os.path.join(glob.escape(dst_path), 'out-*.png')))


def set_exif(image_path: str, comment: str) -> bool:
    """
    Set the Exif comment (XP Comment) of a single image file.

    The comment is written to a temporary JSON file which is passed to
    ``exiftool -overwrite_original -json=<file> <image>``.

    Parameters
    ----------
    image_path : str
        Path of the image file to modify.
    comment : str
        Text to store in the XPComment tag.

    Returns
    -------
    bool
        True when the command ran successfully; False when the image does
        not exist, the command could not be started, or it exited with an
        error.
    """
    # Program name (use a full path if it is not on PATH).
    program = 'exiftool'

    # Nothing to do without an input file.
    if not os.path.isfile(image_path):
        return False

    json_data = [{'XPComment': comment}]

    with tempfile.TemporaryDirectory() as dname:
        json_path = os.path.join(dname, 'tmp.json')
        # The inner "with" closes the file before exiftool reads it.
        with open(json_path, 'w', encoding='utf-8') as fp:
            json.dump(json_data, fp)

        # Argument list instead of one string: no manual quoting, and it
        # also works on platforms that do not accept a command string.
        command = [program, '-overwrite_original', f'-json={json_path}', image_path]
        command_text = ' '.join(command)
        print(command_text)
        try:
            subprocess.check_output(command)
        except (OSError, subprocess.CalledProcessError) as ex:
            print(f'コマンド実行中にエラーが発生しました。command = {command_text}, Exception = {ex}')
            return False
    return True

def ocr_text(src_image_path: str) -> str:
    """
    Run Japanese OCR on a single image file and return the text.

    Calls ``tesseract -l jpn <image> stdout`` and post-processes its output:
    every space character is removed and CRLF line ends become LF.

    Parameters
    ----------
    src_image_path : str
        Input image file path.

    Returns
    -------
    str
        The OCR result. An empty string when the image does not exist, the
        command could not be started, or it exited with an error, so the
        result can always be written to a text file.
    """
    # Program name (use a full path if it is not on PATH).
    program = 'tesseract'

    # Nothing to do without an input file.
    if not os.path.isfile(src_image_path):
        return ''

    # Argument list instead of one string: no manual quoting needed.
    command = [program, '-l', 'jpn', src_image_path, 'stdout']
    try:
        res = subprocess.check_output(command)
    except (OSError, subprocess.CalledProcessError) as ex:
        command_text = ' '.join(command)
        print(f'コマンド実行中にエラーが発生しました。command = {command_text}, Exception = {ex}')
        return ''
    # errors='replace' keeps one undecodable byte from aborting the page.
    res_text = res.decode('utf-8', errors='replace')
    return res_text.replace(' ', '').replace('\r\n', '\n')

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

以上でツールの完成です。

おまけ

ページ毎に画像バラバラでは扱いづらいので以下も実施しました。

画像を縮小、かつ形式をwebp(サイズと画質のバランスが良いとのこと)にしてサイズ節約。
zipでアーカイブして1ファイルにまとめる。拡張子は漫画本アプリなどで使われるcbzとする。

なお、これをすると検索はできなくなります（本末転倒では。。。）。個人で使う場合はEvernoteなどで管理しており、本文にOCRしたテキストを貼っておけば検索できるので、検索性よりも扱いやすさを重視しました。

画像の変換にはImageMagickのコマンドmagick、zip圧縮は7-Zipについてくるコマンド7zを使いました。

import json
import tempfile
import os
from typing import List, Dict
import glob
import subprocess
import shutil

def main(src_dir: str = r'C:\ocr_src', dst_dir: str = r'C:\ocr_dst'):
    """
    Convert every PDF in a folder into one ``.cbz`` archive of WebP pages.

    For each PDF: split it into page images, OCR each image, write the text
    to a ``.txt`` file with the same base name, pass the text to
    ``set_exif``, convert the image to WebP, and finally archive the
    per-PDF folder as ``<folder>.cbz`` and remove the folder.
    When all PDFs are done, the paths of the converted images are printed.

    Parameters
    ----------
    src_dir : str
        Folder searched (non-recursively) for ``*.pdf`` files.
    dst_dir : str
        Folder under which the per-PDF work folder and archive are created.
    """
    # Sort for a reproducible processing order; escape the folder so that
    # characters such as "[" in its name are not read as glob patterns.
    pdf_path_list = sorted(glob.glob(os.path.join(glob.escape(src_dir), '*.pdf')))

    text_path_list = []
    for pdf_path in pdf_path_list:
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        dst_dir_onefile = os.path.join(dst_dir, pdf_name)
        zip_file_path = dst_dir_onefile + '.cbz'
        # Skip PDFs that already have output. The work folder is removed
        # after archiving, so the archive itself must be checked as well;
        # otherwise every rerun would process all PDFs again.
        if os.path.exists(dst_dir_onefile) or os.path.exists(zip_file_path):
            continue
        # Guard against a failed conversion that yields no list at all.
        image_path_list = pdf2image(pdf_path, dst_dir_onefile) or []
        for image_path in image_path_list:
            text = ocr_text(image_path)
            if not isinstance(text, str):
                # Anything other than a string means OCR failed; store empty
                # text instead of crashing in write().
                text = ''
            text_path = os.path.splitext(image_path)[0] + '.txt'
            # Explicit UTF-8: the locale default may be unable to encode
            # every character the OCR produced.
            with open(text_path, 'w', encoding='utf-8') as fp:
                fp.write(text)
            if not set_exif(image_path, text):
                continue
            # Image conversion.
            conv_path = os.path.splitext(image_path)[0] + '.webp'
            # Delete the source image only after a successful conversion,
            # so a failing converter cannot destroy the page.
            if conv_image(image_path, conv_path, quality=60, long_side=1920):
                os.remove(image_path)
                text_path_list.append(conv_path)
        # Remove the work folder only when the archive was really created.
        if archive_zip(dst_dir_onefile, zip_file_path):
            shutil.rmtree(dst_dir_onefile)

    print('完了')
    print('\n'.join(text_path_list))

def pdf2image(src_pdf_path: str, dst_path: str) -> List[str]:
    """
    Extract the images of a PDF file into a folder as PNG files.

    Runs ``pdfimages -png`` with ``dst_path`` as working directory and the
    output prefix ``out``, then collects the ``out-*.png`` files found there.

    Parameters
    ----------
    src_pdf_path : str
        Input PDF file path.
    dst_path : str
        Output folder path; created when it does not exist.

    Returns
    -------
    List[str]
        Sorted paths of the PNG files found in ``dst_path``. Empty when the
        PDF does not exist, when ``dst_path`` is an existing regular file,
        or when the command could not be started.
    """
    # Program name (use a full path if it is not on PATH).
    program = 'pdfimages'

    # Nothing to do without an input file.
    if not os.path.isfile(src_pdf_path):
        return []

    # Create the output folder; a regular file in its place is an error.
    # Return an empty list (not None) so callers can always iterate.
    if not os.path.isdir(dst_path):
        if os.path.isfile(dst_path):
            return []
        os.makedirs(dst_path)

    # The PDF path is made absolute because the command runs inside dst_path.
    command = [program, '-png', os.path.abspath(src_pdf_path), 'out']
    command_text = ' '.join(command)
    try:
        # cwd= replaces os.chdir(): the caller's working directory is never
        # touched, so there is nothing to restore if something goes wrong.
        completed = subprocess.run(command, cwd=dst_path)
    except OSError as ex:
        print(f'コマンド実行中にエラーが発生しました。command = {command_text}, Exception = {ex}')
        return []
    if completed.returncode != 0:
        # Report the failure but still return whatever images were written.
        print(f'コマンド実行中にエラーが発生しました。command = {command_text}, returncode = {completed.returncode}')

    # Sorted so pages come back in a stable order.
    return sorted(glob.glob(os.path.join(glob.escape(dst_path), 'out-*.png')))


def set_exif(image_path: str, comment: str) -> bool:
    """
    Set the Exif comment (XP Comment) of a single image file.

    The comment is written to a temporary JSON file which is passed to
    ``exiftool -overwrite_original -json=<file> <image>``.

    Parameters
    ----------
    image_path : str
        Path of the image file to modify.
    comment : str
        Text to store in the XPComment tag.

    Returns
    -------
    bool
        True when the command ran successfully; False when the image does
        not exist, the command could not be started, or it exited with an
        error.
    """
    # Program name (use a full path if it is not on PATH).
    program = 'exiftool'

    # Nothing to do without an input file.
    if not os.path.isfile(image_path):
        return False

    json_data = [{'XPComment': comment}]

    with tempfile.TemporaryDirectory() as dname:
        json_path = os.path.join(dname, 'tmp.json')
        # The inner "with" closes the file before exiftool reads it.
        with open(json_path, 'w', encoding='utf-8') as fp:
            json.dump(json_data, fp)

        # Argument list instead of one string: no manual quoting, and it
        # also works on platforms that do not accept a command string.
        command = [program, '-overwrite_original', f'-json={json_path}', image_path]
        command_text = ' '.join(command)
        print(command_text)
        try:
            subprocess.check_output(command)
        except (OSError, subprocess.CalledProcessError) as ex:
            print(f'コマンド実行中にエラーが発生しました。command = {command_text}, Exception = {ex}')
            return False
    return True

def ocr_text(src_image_path: str) -> str:
    """
    Run Japanese OCR on a single image file and return the text.

    Calls ``tesseract -l jpn <image> stdout`` and post-processes its output:
    every space character is removed and CRLF line ends become LF.

    Parameters
    ----------
    src_image_path : str
        Input image file path.

    Returns
    -------
    str
        The OCR result. An empty string when the image does not exist, the
        command could not be started, or it exited with an error, so the
        result can always be written to a text file.
    """
    # Program name (use a full path if it is not on PATH).
    program = 'tesseract'

    # Nothing to do without an input file.
    if not os.path.isfile(src_image_path):
        return ''

    # Argument list instead of one string: no manual quoting needed.
    command = [program, '-l', 'jpn', src_image_path, 'stdout']
    try:
        res = subprocess.check_output(command)
    except (OSError, subprocess.CalledProcessError) as ex:
        command_text = ' '.join(command)
        print(f'コマンド実行中にエラーが発生しました。command = {command_text}, Exception = {ex}')
        return ''
    # errors='replace' keeps one undecodable byte from aborting the page.
    res_text = res.decode('utf-8', errors='replace')
    return res_text.replace(' ', '').replace('\r\n', '\n')




def conv_image(src_image_path: str, dst_image_path: str, quality: int = None, long_side: int = None) -> bool:
    """
    Convert an image file to another format with ImageMagick's ``magick``.

    The output format is not passed explicitly; only the destination path
    is handed to the command.

    Parameters
    ----------
    src_image_path : str
        Input image file path.
    dst_image_path : str
        Output image file path.
    quality : int
        Image quality (0-100), passed as ``-quality``. Omitted when None.
    long_side : int
        Bounding size in pixels, passed as ``-resize <n>x<n>>``.
        Omitted when None.

    Returns
    -------
    bool
        True when the command ran successfully; False when the input does
        not exist, the command could not be started, or it exited with an
        error.
    """
    # Program name (use a full path if it is not on PATH).
    program = 'magick'

    # Nothing to do without an input file.
    if not os.path.isfile(src_image_path):
        return False

    # Build an argument list (no manual quoting) in the same option order
    # as before: [-quality q] src [-resize geometry] dst.
    command = [program]
    if quality is not None:
        command += ['-quality', str(quality)]
    command.append(src_image_path)
    if long_side is not None:
        command += ['-resize', f'{long_side}x{long_side}>']
    command.append(dst_image_path)

    command_text = ' '.join(command)
    print(command_text)
    try:
        subprocess.check_output(command)
    except (OSError, subprocess.CalledProcessError) as ex:
        print(f'コマンド実行中にエラーが発生しました。command = {command_text}, Exception = {ex}')
        return False
    return True


def archive_zip(src_dir_path: str, dst_zip_path: str) -> bool:
    """
    Archive a folder into a zip file with ``7z`` at compression level 0.

    Runs ``7z a -tzip -mx=0 <zip> <folder>``.

    Parameters
    ----------
    src_dir_path : str
        Input folder path.
    dst_zip_path : str
        Output zip file path.

    Returns
    -------
    bool
        True when the command ran successfully; False when the folder does
        not exist, the command could not be started, or it exited with an
        error.
    """
    # Program name (use a full path if it is not on PATH).
    program = '7z'

    # Nothing to do without an input folder.
    if not os.path.isdir(src_dir_path):
        return False

    # Argument list instead of one string: no manual quoting needed.
    command = [program, 'a', '-tzip', '-mx=0', dst_zip_path, src_dir_path]
    command_text = ' '.join(command)
    print(command_text)
    try:
        subprocess.check_output(command)
    except (OSError, subprocess.CalledProcessError) as ex:
        print(f'コマンド実行中にエラーが発生しました。command = {command_text}, Exception = {ex}')
        return False
    return True

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

電算倶楽部　富山県のコンピュータ社会人サークル

富山県、特に滑川市、富山市、魚津市周辺で活動している社会人サークルです。

スキャンしたPDFの処理を考えるその4 まとめとおまけ（画像圧縮・zipでまとめる）

処理のステップ

プログラム

おまけ