dChan - Q Origins Project Archive

Here is Python source code that extracts every word from Fauci's email drop and lists the pages on which each word was found.

import nltk

# importing required modules

import PyPDF2

# Build an index of every token in 'fauci.pdf' -> the pages it occurs on,
# then print the index.

# Download the 'punkt' resource (presumably the tokenizer data used by
# nltk.word_tokenize below).
nltk.download('punkt')

# Maps each token to the list of 0-based page indexes it was seen on.
# An index is appended once per occurrence, so a token that appears several
# times on the same page has that page index repeated in its list.
wordListDict = dict()

# NOTE(review): PdfFileReader / numPages / getPage / extractText are the
# legacy camelCase PyPDF2 names; newer PyPDF2 releases appear to have renamed
# them (PdfReader, len(reader.pages), reader.pages[i], extract_text) --
# confirm against the installed version before upgrading the library.

# The context manager closes the PDF even if parsing a page raises.
with open('fauci.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # Report the total number of pages in the PDF.
    print(pdfReader.numPages)

    for pageIndex in range(0, pdfReader.numPages):
        pageObj = pdfReader.getPage(pageIndex)

        # Progress output: index of the page currently being processed.
        print(pageIndex)

        # Split the page's extracted text into tokens.
        wordList = nltk.word_tokenize(pageObj.extractText())

        for word in wordList:
            # setdefault creates the list on first sight of a token, so a
            # single lookup replaces the explicit membership test.
            wordListDict.setdefault(word, []).append(pageIndex)

# Print each token followed by the list of page indexes recorded for it.
for x, y in wordListDict.items():
    print(x, y)