Script I created for automating web scraping
Tree: 5c-c
Find file
Copy path
Ministry-Books-Scrape / ministry_book_scrape.py / Jump to
jerrytigerxu Updates
201d28f on Jan 21
1 contributor
Raw
Blame
History
129 lines (105 sloc)
4.74 KB
You're using jump to definition to discover and navigate code.
1
from selenium import webdriver
2
import webbrowser, os, requests, bs4, pdfkit
3
from selenium.common.exceptions import NoSuchElementException
Learn more or give us feedback
4
5
6
# Create a function that checks if an element you are searching for exists
def link_exists(search_text):  # based on browser.find_element_by_partial_link_text()
    """Return True when a link whose text contains `search_text` is on the
    page currently loaded in the module-level Selenium `browser`.

    Uses EAFP: a missing element surfaces as NoSuchElementException rather
    than being pre-checked.
    """
    try:
        browser.find_element_by_partial_link_text(search_text)
    except NoSuchElementException:
        return False
    return True
15
16
17
# Function that checks if there are no more sections in the book (allowing the
# program to move on to the next book)
def no_more_sections():
    """Return True once the current book has no further sections to visit.

    Looks for the greyed-out navigation button that the site renders when a
    "next" link is exhausted.  Relies on the module-level Selenium `browser`.
    Returns False when no disabled button is present at all.
    """
    try:
        # Locate the disabled button once and reuse it (the original looked
        # it up twice, paying the implicit-wait cost both times).
        disabled_button = browser.find_element_by_css_selector(
            "a[class='button radius disabled']")
        # NOTE(review): the comparison string was truncated in the source this
        # was recovered from; 'next chapter' fits the surrounding logic (a
        # disabled 'next chapter' still leaves sections to read, while any
        # other disabled button means the book is finished) — confirm against
        # the original file.
        if disabled_button.text == 'next chapter':
            return False
        return True
    except NoSuchElementException:
        return False
29
30
31
browser = webdriver.Chrome()
# Opens the chrome browser

# Partial link texts used to walk through a book page by page.
next_chapter_link = 'next chapter'
next_section_link = 'next section'
# Amount of time for browser to wait before clicking a link.
# NOTE(review): implicitly_wait sets an element-lookup timeout, not a pause
# between clicks — at 0.03s it is effectively a no-op; confirm whether a
# time.sleep was intended here.
clickSpeed = 0.03
# First going to the "home page" -> in this case it is the life studies page
homePage = 'https://www.ministrybooks.org/life-studies.cfm'

browser.get(homePage)
# This is the starting point
browser.implicitly_wait(clickSpeed)

# div element where all of the book links live on the index page
list_div = browser.find_element_by_class_name('large-10')
# Get all of the links for the Life-Study books
links = list_div.find_elements_by_partial_link_text('Life-Study')

# The visible book titles double as the link text used to click into each book.
bookname_links = [link.text for link in links]

# Saving the html files in the Documents folder
os.chdir('/Users/zane/Documents/ministry_book_texts')
52
53
# loops through the process for every single book on the bookname_links list
for book_name in bookname_links:
    browser.get(homePage)
    browser.implicitly_wait(clickSpeed)

    # Create an html file that stores all of the text you scrape.
    # NOTE(review): the HTML boilerplate in the original was lost when this
    # file was recovered (the page scrape stripped the tags); this minimal
    # wrapper keeps the output a valid document for pdfkit — confirm against
    # the original template.
    html_string = """<!DOCTYPE html>
<html>
<head>
<title>Title</title>
</head>
<body>
"""

    savePrefix = './ministry_text_%s' % (book_name)
    saveFile = savePrefix + '.html'
    with open(saveFile, 'w') as text_file:
        text_file.write(html_string)

    # Find the link to the particular book and click that link
    if link_exists(book_name):
        link = browser.find_element_by_link_text(book_name)
        browser.implicitly_wait(clickSpeed)
        link.click()
    else:  # Corresponds with first link - Going to particular book
        print('Was not able to find a link with that name.')

    # Stop the while loop when there is no next_chapter_link or
    # next_section_link left in the book.
    while True:
        if not (link_exists(next_chapter_link) or link_exists(next_section_link)):
            print('There is no button called "next chapter" or "next section"')
            break
        # Prefer chapter navigation; fall back to section navigation.
        if link_exists(next_chapter_link):
            link = browser.find_element_by_partial_link_text(next_chapter_link)
            link.click()
            browser.implicitly_wait(clickSpeed)
        elif link_exists(next_section_link):
            link = browser.find_element_by_partial_link_text(next_section_link)
            link.click()
            browser.implicitly_wait(clickSpeed)
        # Pull the ministry text out of the freshly loaded page and append
        # it to the book's html file.
        html = browser.page_source
        soup = bs4.BeautifulSoup(html, features="html.parser")
        html = list(soup.children)[5]
        body = list(html.children)[3]
        ministry_text_tag = (body.select('#ministry-text'))[0]
        ministry_text = str(ministry_text_tag)
        with open(saveFile, 'a') as text_file:
            text_file.write(ministry_text)
        if no_more_sections():
            break
        browser.implicitly_wait(clickSpeed)

    # Add the rest of the html file text to close it off (completes the html
    # file).  NOTE(review): the closing tags were also lost in recovery (the
    # original wrote an empty string here) — confirm against the original.
    with open(saveFile, 'a') as text_file:
        text_file.write("</body>\n</html>")

    # Converting html file into pdf file
    convertFile = savePrefix + '.pdf'
    pdfkit.from_file(saveFile, convertFile)

    # Delete the html file, leaving behind only the pdfs
    os.remove(saveFile)