
# https://www.globalsino.com/ICs/
# Convert PDF file to text file

import io
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

# Path of the PDF file whose text is extracted below.
thePathOfPDFfile = r"C:\0Python\Yougui_Liao.pdf"

# Layout analysis for all text: enable `all_texts` directly through the
# constructor instead of patching the attribute afterwards with setattr().
laparams = pdfminer.layout.LAParams(all_texts=True)

def pdfExtractionFunc(pdf_path):
    """Extract the text of every page of a PDF file.

    Each page is run through a pdfminer ``TextConverter`` that writes into an
    in-memory buffer; the module-level ``laparams`` object is used as the
    layout-analysis configuration.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        The extracted text as one string, or ``None`` when the extracted
        text is empty.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=laparams)

    # try/finally guarantees the converter and the buffer are released even
    # when opening or parsing the PDF raises.
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                page_interpreter.process_page(page)

        # Read the buffer before it is closed in the finally clause.
        text = fake_file_handle.getvalue()
    finally:
        converter.close()
        fake_file_handle.close()

    # Keep the original contract: empty output is reported as None.
    return text if text else None

text = pdfExtractionFunc(thePathOfPDFfile)
print(text)

print("\n")

# Save the extracted text to a txt file.
# The text is written in a single call: iterating over a string yields single
# characters, not lines, so a per-item loop would print and write one
# character at a time. pdfExtractionFunc returns None when nothing was
# extracted; in that case an empty file is written instead of failing.
with open(r"C:\0Python\Yougui_Liao.txt", "w", encoding="utf-8") as out_file:
    out_file.write(text or "")
