
# https://www.globalsino.com/ICs/
# Extract the text of a PDF file page by page and save it (one .txt file per PDF page)

import io
import pdfminer
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import os

# PDF that is processed when this file is run as a script.
thePDFfile = r"C:\0Python\Yougui_Liao.pdf"

# Layout-analysis parameters handed to the text converter for every page.
# Per the original author's note, supplying these makes the extracted text
# keep spaces between words; `all_texts` is switched on so the analysis is
# applied to all text (exact scope depends on pdfminer -- see LAParams docs).
laparams = pdfminer.layout.LAParams()
laparams.all_texts = True

def extract_text_page_by_page(pdf_path):
    """Yield the extracted text of each page of a PDF, one string per page.

    The file at ``pdf_path`` is opened in binary mode and walked with
    ``PDFPage.get_pages``. Each page is rendered exactly once into its own
    in-memory buffer using the module-level ``laparams`` layout settings.
    (Earlier revisions called ``process_page`` twice per page, rendering the
    page into the same buffer twice.)

    Once every page has been consumed, the total page count is printed.

    Args:
        pdf_path: Path of the PDF file to read.

    Yields:
        str: The text extracted from one page, in document order.
    """
    page_count = 0
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            resource_manager = PDFResourceManager()
            # The buffer and converter are released *before* yielding so they
            # never stay open if the caller stops iterating early.
            with io.StringIO() as page_buffer:
                converter = TextConverter(resource_manager, page_buffer, laparams=laparams)
                try:
                    page_interpreter = PDFPageInterpreter(resource_manager, converter)
                    # Render the page once; a second call would append the
                    # same page to the buffer again.
                    page_interpreter.process_page(page)
                    text = page_buffer.getvalue()
                finally:
                    converter.close()
            page_count += 1
            yield text
        # Reached only when the generator has been fully consumed.
        print("The page nubmer of the pdf file is::: ", page_count)
    
def extract_and_save_text(pdf_path, output_dir=r"C:\0Python", base_name="Yougui_Liao"):
    """Write the text of every page of a PDF to its own UTF-8 ``.txt`` file.

    Pages are obtained from ``extract_text_page_by_page`` and numbered from 1.
    Page ``n`` is written to ``<output_dir>/<base_name>_<n>.txt``; an existing
    file with that name is overwritten. Each page's text and its number are
    also printed to standard output.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the text files. It is not created
            here, so it must already exist. Defaults to the location that was
            previously hard-coded.
        base_name: File-name stem used for the text files. Defaults to the
            stem that was previously hard-coded.
    """
    pages = extract_text_page_by_page(pdf_path)
    for page_number, page_text in enumerate(pages, start=1):
        print(page_text)
        print(page_number)
        text_file_path = os.path.join(output_dir, f"{base_name}_{page_number}.txt")
        with open(text_file_path, "w", encoding="utf-8") as out_file:
            out_file.write(page_text)
        
# Run the extraction on the configured PDF only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    extract_and_save_text(pdf_path=thePDFfile)
