Script Python pour extraire du texte d'un fichier PDF

Cela peut être utile …
import pdfplumber  
import argparse  
import os  

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of all non-empty pages of a PDF as a single string.

    The file at ``pdf_path`` is opened with ``pdfplumber``. Every page whose
    ``extract_text()`` result is truthy contributes one chunk made of a
    ``Page <n>`` header line (1-based page number), the page text, and a
    trailing newline. Pages that yield no text are skipped. The chunks are
    joined with a newline separator; an empty string is returned when no
    page yields text.
    """
    with pdfplumber.open(pdf_path) as document:
        # Pair each 1-based page number with its extracted text, lazily.
        numbered_texts = (
            (index, page.extract_text())
            for index, page in enumerate(document.pages, start=1)
        )
        # Materialize inside the ``with`` so pages are read while the PDF is open.
        chunks = [
            f"Page {index}\n{content}\n"
            for index, content in numbered_texts
            if content
        ]
    return "\n".join(chunks)

# Main part of the script to handle argument parsing and file checks
if __name__ == "__main__":
    # Command-line entry point: validate the given path, extract the PDF's
    # text, write it to "extracted_text.txt" in the current working directory
    # and report the word count.
    parser = argparse.ArgumentParser(description="Extract text from a PDF file.")
    parser.add_argument("pdf_path", type=str, help="Path to the PDF file.")
    args = parser.parse_args()

    # Validation failures abort via SystemExit(message): the interpreter
    # prints the message to stderr and exits with status 1, so shell callers
    # can detect the error (previously the script printed to stdout and
    # exited with status 0).
    if not args.pdf_path.lower().endswith(".pdf"):
        raise SystemExit(
            "Error: The specified file does not have a .pdf extension. Please provide a PDF file."
        )
    if not os.path.isfile(args.pdf_path):
        raise SystemExit("Error: The specified file does not exist.")

    text = extract_text_from_pdf(args.pdf_path)
    # Words are whitespace-separated tokens; the "Page <n>" headers added by
    # extract_text_from_pdf are part of the text and therefore counted too.
    word_count = len(text.split())

    output_file = "extracted_text.txt"
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Text extracted and saved to '{output_file}'")
    print(f"Word count: {word_count}")
Script Python pour extraire du texte d'un fichier PDF

Laisser un commentaire Annuler la réponse.