Here is Python source code that extracts all the words from Fauci's email drop and lists the pages on which each word was found.
import nltk
# importing required modules
import PyPDF2
# Build a word -> pages index for 'fauci.pdf' and print it.
#
# Output, in order:
#   1. the number of pages in the PDF,
#   2. each page index as that page is processed (progress indicator),
#   3. one line per distinct token, followed by the list of page indices
#      on which it was found.

# Download the 'punkt' resource before tokenizing (presumably the tokenizer
# data nltk.word_tokenize needs -- confirm against the installed NLTK).
nltk.download('punkt')

# Maps each token to the list of page indices it was found on.  Indices are
# zero-based (they come straight from range()), and an index is appended once
# per occurrence, so a token appearing twice on a page lists that page twice.
wordListDict = dict()

# NOTE(review): PdfFileReader / numPages / getPage / extractText are the
# legacy PyPDF2 names; newer PyPDF2 releases appear to have replaced them with
# PdfReader / len(reader.pages) / reader.pages[i] / extract_text().  They are
# kept here unchanged -- confirm which PyPDF2 version this script targets.
#
# The 'with' block guarantees the PDF file handle is closed even if reading
# or tokenizing a page raises.
with open('fauci.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

    # printing number of pages in pdf file
    print(pdfReader.numPages)

    for pageIndex in range(pdfReader.numPages):
        pageObj = pdfReader.getPage(pageIndex)

        # Progress output: the index of the page being processed.
        print(pageIndex)

        # Extract the page text and split it into tokens.
        wordList = nltk.word_tokenize(pageObj.extractText())

        for word in wordList:
            # setdefault creates the list on first sight of the token and
            # returns the existing list afterwards, so one call covers both
            # the "new word" and "seen before" cases.
            wordListDict.setdefault(word, []).append(pageIndex)

# The index holds only plain strings and ints, so it is safe to print after
# the file has been closed.
for x, y in wordListDict.items():
    print(x, y)