Anonymous ID: 0f1f71 March 18, 2025, 4:57 p.m. No.22784353

JFK files released + bonus Python script to download all the .pdf files in bulk

 

https://www.archives.gov/research/jfk/release-2025

 

--------

 

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def download_pdfs(url, download_folder="pdf_downloads"):
    # Create the output folder if it doesn't exist yet.
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        pdf_count = 0
        for link in soup.find_all('a', href=True):
            href = link['href']

            # Resolve relative links against the page URL.
            absolute_url = urljoin(url, href)

            if absolute_url.lower().endswith('.pdf'):
                try:
                    pdf_response = requests.get(absolute_url, headers=headers)
                    pdf_response.raise_for_status()

                    filename = absolute_url.split('/')[-1]
                    if not filename.lower().endswith('.pdf'):
                        filename += '.pdf'

                    file_path = os.path.join(download_folder, filename)
                    with open(file_path, 'wb') as f:
                        f.write(pdf_response.content)

                    print(f"Downloaded: {filename}")
                    pdf_count += 1

                except requests.RequestException as e:
                    print(f"Failed to download {absolute_url}: {e}")

        print(f"\nTotal PDFs downloaded: {pdf_count}")
        if pdf_count == 0:
            print("No PDF files found on the webpage.")

    except requests.RequestException as e:
        print(f"Error accessing webpage {url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    target_url = "https://www.archives.gov/research/jfk/release-2025"
    download_pdfs(target_url)
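To use it, save the script as something like download_jfk_pdfs.py (the name is just a suggestion), make sure requests and beautifulsoup4 are installed (pip install requests beautifulsoup4), and run it with python; the PDFs land in a pdf_downloads folder next to the script.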

Anonymous ID: 0f1f71 March 18, 2025, 5:33 p.m. No.22784602

>>22784353

 

This .py script will convert every .pdf file into .png images, so you can upload them to an AI that can read the text. You can use this to search for keywords (there's also a local keyword-search sketch after the conversion script below).

 

It has one big dependency: pdf2image uses Poppler under the hood. You can download the release zip (Release-24.08.0-0.zip) from the GitHub repo below and extract it to your C: drive.

 

https://github.com/oschwartz10612/poppler-windows/releases

 

No need to install anything; just add C:\Poppler\Library\bin to your PATH environment variable.
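If you'd rather not touch PATH at all, convert_from_path also takes a poppler_path argument you can point straight at that bin folder. A minimal sketch, assuming you extracted Poppler to C:\Poppler as above ("example.pdf" is just a placeholder filename):

from pdf2image import convert_from_path

# Point pdf2image at Poppler directly instead of editing PATH.
pages = convert_from_path("example.pdf", poppler_path=r"C:\Poppler\Library\bin")
print(f"Rendered {len(pages)} page(s)")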

 

Then run this handy dandy script:

 

import os
from pdf2image import convert_from_path

def convert_pdfs_to_images(input_folder="pdf_downloads", output_folder="pdf_images"):
    """
    Convert all PDFs in input_folder to images and save them in output_folder.

    Parameters:
    - input_folder: Directory containing PDFs (default: 'pdf_downloads')
    - output_folder: Directory to save images (default: 'pdf_images')
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if not os.path.exists(input_folder):
        print(f"Input folder '{input_folder}' not found.")
        return

    for filename in os.listdir(input_folder):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(input_folder, filename)
            try:
                # Render every page of the PDF to a PIL image (requires Poppler).
                images = convert_from_path(pdf_path)

                for i, image in enumerate(images):
                    image_filename = f"{os.path.splitext(filename)[0]}_page_{i+1}.png"
                    image_path = os.path.join(output_folder, image_filename)
                    image.save(image_path, 'PNG')
                    print(f"Saved {image_filename}")

            except Exception as e:
                print(f"Error converting {filename}: {e}")

if __name__ == "__main__":
    convert_pdfs_to_images()
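Run it from the same directory as pdf_downloads and the PNGs end up in pdf_images, one image per page. And here's the keyword-search sketch mentioned above: it OCRs the PNGs locally with pytesseract instead of uploading them to an AI, which is a different route than the one described in this post. It assumes the Tesseract OCR engine is installed and on your PATH, plus pip install pytesseract pillow; "Oswald" is just an example keyword.

import os
from PIL import Image
import pytesseract

def search_images(keyword, image_folder="pdf_images"):
    # OCR each PNG and report the files whose text mentions the keyword.
    matches = []
    for filename in sorted(os.listdir(image_folder)):
        if not filename.lower().endswith('.png'):
            continue
        text = pytesseract.image_to_string(Image.open(os.path.join(image_folder, filename)))
        if keyword.lower() in text.lower():
            matches.append(filename)
            print(f"Match in {filename}")
    print(f"{len(matches)} page(s) mention '{keyword}'")

if __name__ == "__main__":
    search_images("Oswald")  # example keyword; swap in whatever you're digging for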