# Cela peut être utile…
import pdfplumber
import argparse
import os
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Each page that yields text contributes a section of the form
    ``"Page <n>\\n<text>\\n"``, where ``<n>`` is the 1-based page number.
    Pages for which ``extract_text()`` returns nothing are skipped (their
    number is still consumed, so numbering follows the PDF itself).
    Sections are joined with a newline.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The concatenated page sections; an empty string when no page
        yields any text.
    """
    with pdfplumber.open(pdf_path) as document:
        # Pair every page number with its extracted text exactly once, so
        # the filter below does not trigger a second extraction.
        numbered_text = (
            (index, page.extract_text())
            for index, page in enumerate(document.pages, start=1)
        )
        # Must be materialised while the PDF is still open.
        sections = [
            f"Page {index}\n{content}\n"
            for index, content in numbered_text
            if content
        ]
    return "\n".join(sections)
# Main part of the script to handle argument parsing and file checks
if __name__ == "__main__":
    # Command-line interface: the PDF path is required; the output path is
    # optional and defaults to the file name this script has always written.
    parser = argparse.ArgumentParser(description="Extract text from a PDF file.")
    parser.add_argument("pdf_path", type=str, help="Path to the PDF file.")
    parser.add_argument(
        "-o",
        "--output",
        default="extracted_text.txt",
        help="Path of the text file to write (default: %(default)s).",
    )
    args = parser.parse_args()

    # Validate the input before doing any work. Raising SystemExit with a
    # string prints the message to stderr and exits with status 1, so a
    # failure is visible to the calling shell instead of looking like success.
    if not args.pdf_path.lower().endswith(".pdf"):
        raise SystemExit(
            "Error: The specified file does not have a .pdf extension. Please provide a PDF file."
        )
    if not os.path.isfile(args.pdf_path):
        raise SystemExit("Error: The specified file does not exist.")

    text = extract_text_from_pdf(args.pdf_path)

    # Words are whitespace-separated tokens; note that the "Page N" headers
    # added by extract_text_from_pdf are part of the text and are counted.
    word_count = len(text.split())

    output_file = args.output
    # Write explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Text extracted and saved to '{output_file}'")
    print(f"Word count: {word_count}")