Script I created for automating web scraping
Tree: 5c-c
Find file
Copy path
Ministry-Books-Scrape / ministry_book_scrape.py / Jump to
jerrytigerxu Updates
201d28f on Jan 21
1 contributor
Raw
Blame
History
129 lines (105 sloc)
4.74 KB
You're using jump to definition to discover and navigate code.
1
from selenium import webdriver
2
import webbrowser, os, requests, bs4, pdfkit
3
from selenium.common.exceptions import NoSuchElementException
Learn more or give us feedback
4
5
6
# Create a function that checks if an element you are searching for exists
def link_exists(search_text):  # based on browser.find_element_by_partial_link_text()
    """Return True when a link whose text contains `search_text` is on the
    page currently loaded in the module-level Selenium `browser`.

    Uses EAFP: a missing element surfaces as NoSuchElementException rather
    than being pre-checked.
    """
    try:
        browser.find_element_by_partial_link_text(search_text)
    except NoSuchElementException:
        return False
    return True
15
16
17
# Function that checks if there are no more sections in the book (allowing the
# program to move on to the next book)
def no_more_sections():
    """Return True once the current book has no further sections to visit.

    Looks for the greyed-out navigation button that the site renders when a
    "next" link is exhausted.  Relies on the module-level Selenium `browser`.
    Returns False when no disabled button is present at all.
    """
    try:
        # Locate the disabled button once and reuse it (the original looked
        # it up twice, paying the implicit-wait cost both times).
        disabled_button = browser.find_element_by_css_selector(
            "a[class='button radius disabled']")
        # NOTE(review): the comparison string was truncated in the source this
        # was recovered from; 'next chapter' fits the surrounding logic (a
        # disabled 'next chapter' still leaves sections to read, while any
        # other disabled button means the book is finished) — confirm against
        # the original file.
        if disabled_button.text == 'next chapter':
            return False
        return True
    except NoSuchElementException:
        return False
29
30
31
browser = webdriver.Chrome()
# Opens the chrome browser

# Partial link texts used to walk through a book page by page.
next_chapter_link = 'next chapter'
next_section_link = 'next section'
# Amount of time for browser to wait before clicking a link.
# NOTE(review): implicitly_wait sets an element-lookup timeout, not a pause
# between clicks — at 0.03s it is effectively a no-op; confirm whether a
# time.sleep was intended here.
clickSpeed = 0.03
# First going to the "home page" -> in this case it is the life studies page
homePage = 'https://www.ministrybooks.org/life-studies.cfm'

browser.get(homePage)
# This is the starting point
browser.implicitly_wait(clickSpeed)

# div element where all of the book links live on the index page
list_div = browser.find_element_by_class_name('large-10')
# Get all of the links for the Life-Study books
links = list_div.find_elements_by_partial_link_text('Life-Study')

# The visible book titles double as the link text used to click into each book.
bookname_links = [link.text for link in links]

# Saving the html files in the Documents folder
os.chdir('/Users/zane/Documents/ministry_book_texts')
52
53
# loops through the process for every single book on the bookname_links list
for book_name in bookname_links:
    browser.get(homePage)
    browser.implicitly_wait(clickSpeed)

    # Create an html file that stores all of the text you scrape.
    # NOTE(review): the HTML boilerplate in the original was lost when this
    # file was recovered (the page scrape stripped the tags); this minimal
    # wrapper keeps the output a valid document for pdfkit — confirm against
    # the original template.
    html_string = """<!DOCTYPE html>
<html>
<head>
<title>Title</title>
</head>
<body>
"""

    savePrefix = './ministry_text_%s' % (book_name)
    saveFile = savePrefix + '.html'
    with open(saveFile, 'w') as text_file:
        text_file.write(html_string)

    # Find the link to the particular book and click that link
    if link_exists(book_name):
        link = browser.find_element_by_link_text(book_name)
        browser.implicitly_wait(clickSpeed)
        link.click()
    else:  # Corresponds with first link - Going to particular book
        print('Was not able to find a link with that name.')

    # Stop the while loop when there is no next_chapter_link or
    # next_section_link left in the book.
    while True:
        if not (link_exists(next_chapter_link) or link_exists(next_section_link)):
            print('There is no button called "next chapter" or "next section"')
            break
        # Prefer chapter navigation; fall back to section navigation.
        if link_exists(next_chapter_link):
            link = browser.find_element_by_partial_link_text(next_chapter_link)
            link.click()
            browser.implicitly_wait(clickSpeed)
        elif link_exists(next_section_link):
            link = browser.find_element_by_partial_link_text(next_section_link)
            link.click()
            browser.implicitly_wait(clickSpeed)
        # Pull the ministry text out of the freshly loaded page and append
        # it to the book's html file.
        html = browser.page_source
        soup = bs4.BeautifulSoup(html, features="html.parser")
        html = list(soup.children)[5]
        body = list(html.children)[3]
        ministry_text_tag = (body.select('#ministry-text'))[0]
        ministry_text = str(ministry_text_tag)
        with open(saveFile, 'a') as text_file:
            text_file.write(ministry_text)
        if no_more_sections():
            break
        browser.implicitly_wait(clickSpeed)

    # Add the rest of the html file text to close it off (completes the html
    # file).  NOTE(review): the closing tags were also lost in recovery (the
    # original wrote an empty string here) — confirm against the original.
    with open(saveFile, 'a') as text_file:
        text_file.write("</body>\n</html>")

    # Converting html file into pdf file
    convertFile = savePrefix + '.pdf'
    pdfkit.from_file(saveFile, convertFile)

    # Delete the html file, leaving behind only the pdfs
    os.remove(saveFile)