#!/usr/bin/env python
# coding: utf-8
#
# Recovered from git patch 34018405a452f3baa93671b64101a5724062cf24
# ("Create scrape_background.py", Brian Haley, Fri, 16 Aug 2019 18:56:56 -0400),
# which creates src/backgrounds/scrape_background.py.

# # Scrape data from aon2e and generate csvs to import in to sqlite

# In[ ]:


# Dependencies
import os

import pandas as pd
from bs4 import BeautifulSoup as bs, Tag, NavigableString
from splinter import Browser

# Setting up Selenium
chrome_driver = os.path.join('..', 'chromedriver.exe')
executable_path = {'executable_path': chrome_driver}
browser = Browser('chrome', **executable_path, headless=False)

# Pandas config
pd.set_option('display.max_columns', None)


# In[ ]:


def _is_br(node):
    """Return True when *node* is a <br> tag.

    Uses ``getattr`` so that ``None`` (no sibling on that side) and nodes
    without a tag name simply count as "not a <br>" instead of raising.
    """
    return getattr(node, 'name', None) == 'br'


def _extract_name(content):
    """Return the stripped text of the first link inside the 'title' <h1>.

    Raises:
        AttributeError: if *content*, the heading, or its link is missing.
    """
    return content.find('h1', class_='title').a.text.strip()


def _extract_description(content):
    """Build the description text from the nodes after the 'external-link'.

    The walk starts at the siblings that follow the node immediately after
    the first ``<a class="external-link">`` (the original author describes
    that anchor as the link to the book). Text nodes are appended verbatim,
    non-<br> tags contribute their stripped text, and <br> tags become
    line-break markers: ' /n' for a single <br>, ' /n/n' for the last <br>
    of a run of two or more.

    Raises:
        AttributeError: if *content* or the anchor is missing, or nothing
            follows the anchor.
    """
    start = content.find('a', class_='external-link').next_sibling
    parts = []
    for node in start.next_siblings:
        if isinstance(node, NavigableString):
            # Plain text between tags is kept as-is.
            parts.append(str(node))
        elif isinstance(node, Tag):
            if node.name != 'br':
                # Append the text inside the element.
                parts.append(node.text.strip())
            elif _is_br(node.next_sibling):
                # Another <br> follows; the last <br> of the run emits the
                # marker, so skip this one.
                continue
            elif _is_br(node.previous_sibling):
                # Last <br> of a run of <br> tags.
                parts.append(' /n/n')
            else:
                # A lone <br>; this also covers a <br> that is the final
                # sibling, which previously raised and wiped the whole
                # description.
                parts.append(' /n')
    return ''.join(parts)


def scrape_description(url: str, id_number: int) -> pd.DataFrame:
    """Scrape the name and description of pages 1..id_number.

    For every page number the module-level ``browser`` visits
    ``url + str(page)``, the HTML is parsed with ``html.parser`` and the
    element with id ``ctl00_MainContent_DetailedOutput`` is inspected.
    Progress is printed for every page.

    Args:
        url: URL prefix; the page number is appended to it.
        id_number: Highest page number to visit (inclusive, starting at 1).

    Returns:
        A DataFrame with the columns ``'Name'`` and ``'description'``, one
        row per visited page. When a value cannot be extracted because the
        expected markup is missing, the placeholder ``'name: <page>'`` is
        stored instead.
    """
    # Lists to store the scraped values
    name_list = []
    description_list = []

    print('Beginning Data Retrieval')
    print('------------------------')

    # Loop from 1 to id_number (inclusive)
    for page in range(1, id_number + 1):

        browser.visit(url + str(page))
        soup = bs(browser.html, 'html.parser')

        # Select only the content section
        content = soup.find(id='ctl00_MainContent_DetailedOutput')

        # Only missing markup is treated as "no data"; anything else
        # (including Ctrl-C) is allowed to propagate.
        try:
            name = _extract_name(content)
        except AttributeError:
            name = f'name: {page}'

        try:
            description = _extract_description(content)
        except AttributeError:
            # NOTE(review): the fallback reuses the 'name: ' prefix for the
            # description column; kept unchanged in case downstream relies
            # on it -- confirm whether that is intended.
            description = f'name: {page}'

        print(f'{page} of {id_number} | {name}')

        # Append values to our lists
        name_list.append(name)
        description_list.append(description)

    print('------------------------')
    print('Data Retrieval Complete')

    # Create df with the scraped data
    data = {'Name': name_list, 'description': description_list}

    # Returns a data frame
    return pd.DataFrame(data)


# In[ ]:


# scrape the descriptions
url_background = 'https://2e.aonprd.com/Backgrounds.aspx?ID='
number_background = 50  # number to scrape

try:
    description_background = scrape_description(url_background, number_background)
finally:
    # Always close the browser window and driver, even if scraping fails.
    browser.quit()


# In[ ]:


description_background.to_csv('background.csv', encoding='UTF-8', index=False)