# PDF Text Data Scraping - Task 1
import os
import re
import PyPDF2
from PyPDF2 import PdfReader
!python your_script.py
# Matches the label "Project by: " followed by a name. The name class is
# limited to letters, spaces and tabs (not the broader \s) so that a capture
# can never run across a line break into the text of the following line.
_OWNER_PATTERN = re.compile(r'Project by: ([A-Za-z \t]+)')


def extract_owner_names(pdf_path):
    """Collect project-owner names from every page of a PDF file.

    Each page's extracted text is searched for occurrences of
    ``Project by: <name>``, where ``<name>`` consists of ASCII letters
    separated by spaces or tabs on a single line.

    Args:
        pdf_path: Path of the PDF file to scan.

    Returns:
        A list of the names found, in page order and with duplicates kept.
        Leading and trailing whitespace is removed from every name. The list
        is empty when no page contains a matching label.

    Raises:
        OSError: If ``pdf_path`` cannot be opened for reading.
    """
    owner_names = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # A page may produce no text at all; fall back to '' so the
            # regex is always handed a string.
            text = page.extract_text() or ''
            for match in _OWNER_PATTERN.findall(text):
                name = match.strip()
                # A label followed only by blanks is not a usable name.
                if name:
                    owner_names.append(name)
    return owner_names
def main(pdf_directory='/content/list_of_PDFs'):
    """Print the project owners found in each PDF file of a directory.

    Args:
        pdf_directory: Directory whose PDF files are scanned. Defaults to
            the path the script originally hard-coded.

    Raises:
        OSError: If ``pdf_directory`` cannot be listed.
    """
    # os.listdir returns names in arbitrary order, so sort for a stable
    # report. The suffix test ignores case so '.PDF' files are included, and
    # non-files (e.g. a directory named 'x.pdf') are skipped because they
    # cannot be opened as PDFs.
    pdf_files = sorted(
        name
        for name in os.listdir(pdf_directory)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdf_directory, name))
    )

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        owners = extract_owner_names(pdf_path)
        if owners:
            print(f"Project owners in '{pdf_file}': {', '.join(owners)}")
        else:
            print(f"No project owners found in '{pdf_file}'")
# Run the report only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()