From 9217748e15965d700ab58234a9741a7df02fad8f Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Fri, 16 Aug 2019 18:40:48 -0400 Subject: [PATCH] Customise for backgrounds --- src/backgrounds/scrape_background.ipynb | 35 ++++++++++++++++++------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/backgrounds/scrape_background.ipynb b/src/backgrounds/scrape_background.ipynb index 22c032a..ca5ad00 100644 --- a/src/backgrounds/scrape_background.ipynb +++ b/src/backgrounds/scrape_background.ipynb @@ -54,21 +54,36 @@ " content = soup.find(id='ctl00_MainContent_DetailedOutput')\n", "\n", " try:\n", - " # Store the name and description\n", - " name = content.find('h1', class_='title')\n", - " name.span.decompose()\n", - " name = name.text\n", + " # Store the name\n", + " name = content.find('h1', class_='title').a.text.strip()\n", + " name\n", "\n", " except:\n", " name = f'name: {page}'\n", "\n", " try:\n", + " # Start the loop after the link to the book\n", + " start = content.find('a', class_='external-link').next_sibling\n", " description = ''\n", - " start = content.find('hr')\n", " for e in start.next_siblings:\n", " if isinstance(e, Tag):\n", - " description = description + e.text.strip()\n", + " if e.name == 'br':\n", + " if e.next_sibling.name == 'br':\n", + " # If the next 2 elements are br skip this\n", + " # loop it will be handled in the elif\n", + " continue\n", + " elif e.previous_sibling.name == 'br':\n", + " # If this element and the previous are br\n", + " # and the next is not append \\n\n", + " description = description + ' \\n\\n '\n", + " else:\n", + " # If there is just one br append \\n\n", + " description = description + ' \\n '\n", + " else:\n", + " # Append the text inside the element\n", + " description = description + e.text.strip()\n", " elif isinstance(e, NavigableString):\n", + " # Since it is just a text append it\n", " description = description + e\n", "\n", " except:\n", @@ -97,10 +112,10 @@ "outputs": [], "source": [ "# scrape the descriptions\n", - "url_background = 'https://2e.aonprd.com/Equipment.aspx?ID='\n", - "number_background = 65 #65 to scrape\n", + "url_background = 'https://2e.aonprd.com/Backgrounds.aspx?ID='\n", + "number_background = 50 # number to scrape\n", "\n", - "description_background = scrape_description(url_gear, number_gear)" + "description_background = scrape_description(url_background, number_background)" ] }, { @@ -109,7 +124,7 @@ "metadata": {}, "outputs": [], "source": [ - "gear.to_csv('background.csv')" + "description_background.to_csv('background.csv', encoding='UTF-8', index=False)" ] }, {