From 328066a03f90229b333bfd6684b77ecc3d86d30c Mon Sep 17 00:00:00 2001 From: Brian Date: Fri, 9 Aug 2019 14:28:18 +0000 Subject: [PATCH] Add readmes, explanation and link for chromedriver, clear notebook outputs --- src/README.md | 9 ++++ src/requirements.txt | 4 ++ src/weapons/README.md | 7 +++ src/weapons/scrape.ipynb | 112 ++++----------------------------------- src/weapons/scrape.py | 112 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 141 insertions(+), 103 deletions(-) create mode 100644 src/README.md create mode 100644 src/requirements.txt create mode 100644 src/weapons/README.md create mode 100644 src/weapons/scrape.py diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000..38076ae --- /dev/null +++ b/src/README.md @@ -0,0 +1,9 @@ +# The purpose of this directory is to perform scraping of [Archives of Nethys](https://2e.aonprd.com) + +## Requirements +1. Python 3.6.8 +2. pandas==0.24.2 +3. splinter==0.11.0 +4. beautifulsoup4==4.8.0 +5. selenium==3.141.0 +6. Download the [chrome WebDriver](https://splinter.readthedocs.io/en/latest/drivers/chrome.html) and place it in this directory. It is required by selenium and splinter as using Requests was prone to failure. \ No newline at end of file diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000..45a3499 --- /dev/null +++ b/src/requirements.txt @@ -0,0 +1,4 @@ +pandas==0.24.2 +splinter==0.11.0 +beautifulsoup4==4.8.0 +selenium==3.141.0 \ No newline at end of file diff --git a/src/weapons/README.md b/src/weapons/README.md new file mode 100644 index 0000000..f7f5ea0 --- /dev/null +++ b/src/weapons/README.md @@ -0,0 +1,7 @@ +# This directory scrapes the weapons from the [Archives of Nethys](https://2e.aonprd.com/Weapons.aspx) + +## Steps to scrape the weapons +1. Install the requirements from [the previous readme](../README.md) +2. 
Generate .csv files by copying and pasting the melee and ranged tables from [here](https://2e.aonprd.com/Weapons.aspx) and save them in this directory as `melee.csv` and `ranged.csv` (keep the `Name` header column, the script joins on it) +3. Set the `number_of_weapons` variable to the number of weapons in the database (currently 83) +4. Run the [python file](scrape.py) or [Jupyter Notebook](scrape.ipynb) \ No newline at end of file diff --git a/src/weapons/scrape.ipynb b/src/weapons/scrape.ipynb index 1de5ad3..2ca34e0 100644 --- a/src/weapons/scrape.ipynb +++ b/src/weapons/scrape.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -20,7 +20,7 @@ "from splinter import Browser\n", "\n", "# Setting up Selenium\n", - "chrome_driver = os.path.join('..', 'resources', 'chromedriver.exe')\n", + "chrome_driver = os.path.join('..', 'chromedriver.exe')\n", "executable_path = {'executable_path': chrome_driver}\n", "browser = Browser('chrome', **executable_path, headless=False)\n", "\n", @@ -30,103 +30,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Beginning Data Retrieval\n", - "------------------------\n", - "Processing Weapon 1 of 83 | Fist\n", - "Processing Weapon 2 of 83 | Club\n", - "Processing Weapon 3 of 83 | Dagger\n", - "Processing Weapon 4 of 83 | Gauntlet\n", - "Processing Weapon 5 of 83 | Light Mace\n", - "Processing Weapon 6 of 83 | Longspear\n", - "Processing Weapon 7 of 83 | Mace\n", - "Processing Weapon 8 of 83 | Morningstar\n", - "Processing Weapon 9 of 83 | Sickle\n", - "Processing Weapon 10 of 83 | Spear\n", - "Processing Weapon 11 of 83 | Spiked Gauntlet\n", - "Processing Weapon 12 of 83 | Staff\n", - "Processing Weapon 13 of 83 | Clan Dagger\n", - "Processing Weapon 14 of 83 | Katar\n", - "Processing Weapon 15 of 83 | Bastard Sword\n", - "Processing Weapon 16 of 83 | Battle Axe\n", - "Processing Weapon 17 of 83 | Bo Staff\n", - "Processing 
Weapon 18 of 83 | Falchion\n", - "Processing Weapon 19 of 83 | Flail\n", - "Processing Weapon 20 of 83 | Glaive\n", - "Processing Weapon 21 of 83 | Greataxe\n", - "Processing Weapon 22 of 83 | Greatclub\n", - "Processing Weapon 23 of 83 | Greatpick\n", - "Processing Weapon 24 of 83 | Greatsword\n", - "Processing Weapon 25 of 83 | Guisarme\n", - "Processing Weapon 26 of 83 | Halberd\n", - "Processing Weapon 27 of 83 | Hatchet\n", - "Processing Weapon 28 of 83 | Lance\n", - "Processing Weapon 29 of 83 | Light Hammer\n", - "Processing Weapon 30 of 83 | Light Pick\n", - "Processing Weapon 31 of 83 | Longsword\n", - "Processing Weapon 32 of 83 | Main-gauche\n", - "Processing Weapon 33 of 83 | Maul\n", - "Processing Weapon 34 of 83 | Pick\n", - "Processing Weapon 35 of 83 | Ranseur\n", - "Processing Weapon 36 of 83 | Rapier\n", - "Processing Weapon 37 of 83 | Sap\n", - "Processing Weapon 38 of 83 | Scimitar\n", - "Processing Weapon 39 of 83 | Scythe\n", - "Processing Weapon 40 of 83 | Shield Bash\n", - "Processing Weapon 41 of 83 | Shield Boss\n", - "Processing Weapon 42 of 83 | Shield Spikes\n", - "Processing Weapon 43 of 83 | Shortsword\n", - "Processing Weapon 44 of 83 | Starknife\n", - "Processing Weapon 45 of 83 | Trident\n", - "Processing Weapon 46 of 83 | War Flail\n", - "Processing Weapon 47 of 83 | Warhammer\n", - "Processing Weapon 48 of 83 | Whip\n", - "Processing Weapon 49 of 83 | Dogslicer\n", - "Processing Weapon 50 of 83 | Elven Curve Blade\n", - "Processing Weapon 51 of 83 | Filcher's Fork\n", - "Processing Weapon 52 of 83 | Gnome Hooked Hammer\n", - "Processing Weapon 53 of 83 | Horsechopper\n", - "Processing Weapon 54 of 83 | Kama\n", - "Processing Weapon 55 of 83 | Katana\n", - "Processing Weapon 56 of 83 | Kukri\n", - "Processing Weapon 57 of 83 | Nunchaku\n", - "Processing Weapon 58 of 83 | Orc Knuckle Dagger\n", - "Processing Weapon 59 of 83 | Sai\n", - "Processing Weapon 60 of 83 | Spiked Chain\n", - "Processing Weapon 61 of 83 | Temple Sword\n", - 
"Processing Weapon 62 of 83 | Dwarven War Axe\n", - "Processing Weapon 63 of 83 | Gnome Flickmace\n", - "Processing Weapon 64 of 83 | Orc Necksplitter\n", - "Processing Weapon 65 of 83 | Sawtooth Saber\n", - "Processing Weapon 66 of 83 | Blowgun\n", - "Processing Weapon 67 of 83 | Crossbow\n", - "Processing Weapon 68 of 83 | Dart\n", - "Processing Weapon 69 of 83 | Hand Crossbow\n", - "Processing Weapon 70 of 83 | Heavy Crossbow\n", - "Processing Weapon 71 of 83 | Javelin\n", - "Processing Weapon 72 of 83 | Sling\n", - "Processing Weapon 73 of 83 | Alchemical Bomb\n", - "Processing Weapon 74 of 83 | Composite Longbow\n", - "Processing Weapon 75 of 83 | Composite Shortbow\n", - "Processing Weapon 76 of 83 | Longbow\n", - "Processing Weapon 77 of 83 | Shortbow\n", - "Processing Weapon 78 of 83 | Halfling Sling Staff\n", - "Processing Weapon 79 of 83 | Shuriken\n", - "Processing Weapon 80 of 83 | Blowgun Darts\n", - "Processing Weapon 81 of 83 | Bolts\n", - "Processing Weapon 82 of 83 | Sling Bullets\n", - "Processing Weapon 83 of 83 | Arrows\n", - "------------------------\n", - "Data Retrieval Complete\n" - ] - } - ], + "outputs": [], "source": [ "# url that contains all the links\n", "url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='\n", @@ -177,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -188,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -198,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -207,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -216,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ diff --git a/src/weapons/scrape.py 
b/src/weapons/scrape.py
new file mode 100644
index 0000000..00ad14d
--- /dev/null
+++ b/src/weapons/scrape.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# coding: utf-8
+"""Scrape weapon names and descriptions from Archives of Nethys (aon2e).
+
+Every weapon detail page is opened in a Chrome browser driven by splinter
+and its name and description are collected. The descriptions are then
+joined onto the hand-made melee.csv / ranged.csv tables (matched on the
+``Name`` column) and both files are rewritten in place, ready to be
+imported into sqlite.
+"""
+
+# Dependencies
+import os
+
+import pandas as pd
+from bs4 import BeautifulSoup as bs
+from splinter import Browser
+
+# Pandas config
+pd.set_option('display.max_columns', None)
+
+# Base url of a weapon page; the numeric weapon ID is appended to it
+url_weapon = 'https://2e.aonprd.com/Weapons.aspx?ID='
+
+# Number of weapons; IDs 1..number_of_weapons are visited
+number_of_weapons = 83
+
+
+def parse_weapon(html, weapon):
+    """Return ``(name, description)`` parsed from one weapon page.
+
+    html is the page source and weapon the numeric ID that was requested.
+    When the expected markup is missing the name falls back to
+    ``'weapon: <ID>'`` and the description to ``None``, so a single odd
+    page cannot abort the whole run.
+    """
+    soup = bs(html, 'html.parser')
+
+    # Select only the content section
+    content = soup.find(id='ctl00_MainContent_DetailedOutput')
+    if content is None:
+        return f'weapon: {weapon}', None
+
+    link = content.find('a')
+    name = link.text.strip() if link is not None else f'weapon: {weapon}'
+
+    # The description is whatever node directly follows the first <hr>
+    rule = content.find('hr')
+    node = rule.next_element if rule is not None else None
+    if node is None:
+        description = None
+    elif isinstance(node, str):
+        # Bare text node (NavigableString is a str subclass)
+        description = node.strip()
+    else:
+        description = node.get_text().strip()
+
+    return name, description
+
+
+# Setting up Selenium
+chrome_driver = os.path.join('..', 'chromedriver.exe')
+executable_path = {'executable_path': chrome_driver}
+browser = Browser('chrome', **executable_path, headless=False)
+
+# Empty lists to store the scraped values
+name_list = []
+description_list = []
+
+print('Beginning Data Retrieval')
+print('------------------------')
+
+try:
+    # Loop from 1 to the value in number_of_weapons
+    for weapon in range(1, number_of_weapons + 1):
+        browser.visit(url_weapon + str(weapon))
+        name, description = parse_weapon(browser.html, weapon)
+
+        print(f'Processing Weapon {weapon} of {number_of_weapons} | {name}')
+
+        # Append values to our lists
+        name_list.append(name)
+        description_list.append(description)
+finally:
+    # Always shut Chrome/chromedriver down, even when a page visit fails
+    browser.quit()
+
+print('------------------------')
+print('Data Retrieval Complete')
+
+data = {'Name': name_list, 'description': description_list}
+scrape = pd.DataFrame(data)
+
+# csv files which are taken from https://2e.aonprd.com/Weapons.aspx
+melee = pd.read_csv('melee.csv')
+ranged = pd.read_csv('ranged.csv')
+
+# Left merge keeps every table row even when no description was scraped
+melee = melee.merge(scrape, how='left', on='Name')
+ranged = ranged.merge(scrape, how='left', on='Name')
+
+# index=False: the row index is not data and must not end up as an extra
+# unnamed column in files that are read back in on the next run
+melee.to_csv('melee.csv', index=False)
+ranged.to_csv('ranged.csv', index=False)