From 336fa7412552e4062305bbf649d35eec0ee3fa10 Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Mon, 4 May 2020 19:02:15 -0400 Subject: [PATCH] Working initial scrape --- scripts/creatures/.gitignore | 135 ++++++++ scripts/creatures/Pipfile | 12 + scripts/creatures/Pipfile.lock | 324 ++++++++++++++++++ scripts/creatures/creatures/__init__.py | 0 scripts/creatures/creatures/items.py | 18 + scripts/creatures/creatures/middlewares.py | 103 ++++++ scripts/creatures/creatures/pipelines.py | 11 + scripts/creatures/creatures/settings.py | 90 +++++ .../creatures/creatures/spiders/__init__.py | 4 + .../creatures/creatures/spiders/creatures.py | 31 ++ scripts/creatures/scrapy.cfg | 11 + 11 files changed, 739 insertions(+) create mode 100644 scripts/creatures/.gitignore create mode 100644 scripts/creatures/Pipfile create mode 100644 scripts/creatures/Pipfile.lock create mode 100644 scripts/creatures/creatures/__init__.py create mode 100644 scripts/creatures/creatures/items.py create mode 100644 scripts/creatures/creatures/middlewares.py create mode 100644 scripts/creatures/creatures/pipelines.py create mode 100644 scripts/creatures/creatures/settings.py create mode 100644 scripts/creatures/creatures/spiders/__init__.py create mode 100644 scripts/creatures/creatures/spiders/creatures.py create mode 100644 scripts/creatures/scrapy.cfg diff --git a/scripts/creatures/.gitignore b/scripts/creatures/.gitignore new file mode 100644 index 0000000..5e63850 --- /dev/null +++ b/scripts/creatures/.gitignore @@ -0,0 +1,135 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +### Example user template template +### Example user template + +# IntelliJ project files +.idea +*.iml +out +gen diff --git a/scripts/creatures/Pipfile b/scripts/creatures/Pipfile new file mode 100644 index 0000000..853e486 --- /dev/null +++ b/scripts/creatures/Pipfile @@ -0,0 +1,12 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +scrapy = "*" + +[requires] +python_version = "3.8" diff --git a/scripts/creatures/Pipfile.lock b/scripts/creatures/Pipfile.lock new file mode 100644 index 0000000..a555f37 --- /dev/null +++ b/scripts/creatures/Pipfile.lock @@ -0,0 +1,324 @@ +{ + "_meta": { + "hash": { + "sha256": "0248cce57c7a462bbed0c9ce5d4a84b92d0e146d394de44d71af073668ad2c1d" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.8" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "attrs": { + "hashes": [ + "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", + "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" + ], + "version": "==19.3.0" + }, + "automat": { + "hashes": [ + "sha256:7979803c74610e11ef0c0d68a2942b152df52da55336e0c9d58daf1831cbdf33", + "sha256:b6feb6455337df834f6c9962d6ccf771515b7d939bca142b29c20c2376bc6111" + ], + "version": "==20.2.0" + }, + "cffi": { + "hashes": [ + "sha256:001bf3242a1bb04d985d63e138230802c6c8d4db3668fb545fb5005ddf5bb5ff", + "sha256:00789914be39dffba161cfc5be31b55775de5ba2235fe49aa28c148236c4e06b", + "sha256:028a579fc9aed3af38f4892bdcc7390508adabc30c6af4a6e4f611b0c680e6ac", + "sha256:14491a910663bf9f13ddf2bc8f60562d6bc5315c1f09c704937ef17293fb85b0", + "sha256:1cae98a7054b5c9391eb3249b86e0e99ab1e02bb0cc0575da191aedadbdf4384", + "sha256:2089ed025da3919d2e75a4d963d008330c96751127dd6f73c8dc0c65041b4c26", + "sha256:2d384f4a127a15ba701207f7639d94106693b6cd64173d6c8988e2c25f3ac2b6", + "sha256:337d448e5a725bba2d8293c48d9353fc68d0e9e4088d62a9571def317797522b", + "sha256:399aed636c7d3749bbed55bc907c3288cb43c65c4389964ad5ff849b6370603e", + "sha256:3b911c2dbd4f423b4c4fcca138cadde747abdb20d196c4a48708b8a2d32b16dd", + "sha256:3d311bcc4a41408cf5854f06ef2c5cab88f9fded37a3b95936c9879c1640d4c2", + "sha256:62ae9af2d069ea2698bf536dcfe1e4eed9090211dbaafeeedf5cb6c41b352f66", + "sha256:66e41db66b47d0d8672d8ed2708ba91b2f2524ece3dee48b5dfb36be8c2f21dc", + "sha256:675686925a9fb403edba0114db74e741d8181683dcf216be697d208857e04ca8", + "sha256:7e63cbcf2429a8dbfe48dcc2322d5f2220b77b2e17b7ba023d6166d84655da55", + "sha256:8a6c688fefb4e1cd56feb6c511984a6c4f7ec7d2a1ff31a10254f3c817054ae4", + "sha256:8c0ffc886aea5df6a1762d0019e9cb05f825d0eec1f520c51be9d198701daee5", + "sha256:95cd16d3dee553f882540c1ffe331d085c9e629499ceadfbda4d4fde635f4b7d", + "sha256:99f748a7e71ff382613b4e1acc0ac83bf7ad167fb3802e35e90d9763daba4d78", + "sha256:b8c78301cefcf5fd914aad35d3c04c2b21ce8629b5e4f4e45ae6812e461910fa", + "sha256:c420917b188a5582a56d8b93bdd8e0f6eca08c84ff623a4c16e809152cd35793", + "sha256:c43866529f2f06fe0edc6246eb4faa34f03fe88b64a0a9a942561c8e22f4b71f", + "sha256:cab50b8c2250b46fe738c77dbd25ce017d5e6fb35d3407606e7a4180656a5a6a", + "sha256:cef128cb4d5e0b3493f058f10ce32365972c554572ff821e175dbc6f8ff6924f", + "sha256:cf16e3cf6c0a5fdd9bc10c21687e19d29ad1fe863372b5543deaec1039581a30", + "sha256:e56c744aa6ff427a607763346e4170629caf7e48ead6921745986db3692f987f", + "sha256:e577934fc5f8779c554639376beeaa5657d54349096ef24abe8c74c5d9c117c3", + "sha256:f2b0fa0c01d8a0c7483afd9f31d7ecf2d71760ca24499c8697aeb5ca37dc090c" + ], + "version": "==1.14.0" + }, + "constantly": { + "hashes": [ + "sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35", + "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d" + ], + "version": "==15.1.0" + }, + "cryptography": { + "hashes": [ + "sha256:091d31c42f444c6f519485ed528d8b451d1a0c7bf30e8ca583a0cac44b8a0df6", + "sha256:18452582a3c85b96014b45686af264563e3e5d99d226589f057ace56196ec78b", + "sha256:1dfa985f62b137909496e7fc182dac687206d8d089dd03eaeb28ae16eec8e7d5", + "sha256:1e4014639d3d73fbc5ceff206049c5a9a849cefd106a49fa7aaaa25cc0ce35cf", + "sha256:22e91636a51170df0ae4dcbd250d318fd28c9f491c4e50b625a49964b24fe46e", + "sha256:3b3eba865ea2754738616f87292b7f29448aec342a7c720956f8083d252bf28b", + "sha256:651448cd2e3a6bc2bb76c3663785133c40d5e1a8c1a9c5429e4354201c6024ae", + "sha256:726086c17f94747cedbee6efa77e99ae170caebeb1116353c6cf0ab67ea6829b", + "sha256:844a76bc04472e5135b909da6aed84360f522ff5dfa47f93e3dd2a0b84a89fa0", + "sha256:88c881dd5a147e08d1bdcf2315c04972381d026cdb803325c03fe2b4a8ed858b", + "sha256:96c080ae7118c10fcbe6229ab43eb8b090fccd31a09ef55f83f690d1ef619a1d", + "sha256:a0c30272fb4ddda5f5ffc1089d7405b7a71b0b0f51993cb4e5dbb4590b2fc229", + "sha256:bb1f0281887d89617b4c68e8db9a2c42b9efebf2702a3c5bf70599421a8623e3", + "sha256:c447cf087cf2dbddc1add6987bbe2f767ed5317adb2d08af940db517dd704365", + "sha256:c4fd17d92e9d55b84707f4fd09992081ba872d1a0c610c109c18e062e06a2e55", + "sha256:d0d5aeaedd29be304848f1c5059074a740fa9f6f26b84c5b63e8b29e73dfc270", + "sha256:daf54a4b07d67ad437ff239c8a4080cfd1cc7213df57d33c97de7b4738048d5e", + "sha256:e993468c859d084d5579e2ebee101de8f5a27ce8e2159959b6673b418fd8c785", + "sha256:f118a95c7480f5be0df8afeb9a11bd199aa20afab7a96bcf20409b411a3a85f0" + ], + "version": "==2.9.2" + }, + "cssselect": { + "hashes": [ + "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf", + "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc" + ], + "version": "==1.1.0" + }, + "hyperlink": { + "hashes": [ + "sha256:4288e34705da077fada1111a24a0aa08bb1e76699c9ce49876af722441845654", + "sha256:ab4a308feb039b04f855a020a6eda3b18ca5a68e6d8f8c899cbe9e653721d04f" + ], + "version": "==19.0.0" + }, + "idna": { + "hashes": [ + "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb", + "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa" + ], + "version": "==2.9" + }, + "incremental": { + "hashes": [ + "sha256:717e12246dddf231a349175f48d74d93e2897244939173b01974ab6661406b9f", + "sha256:7b751696aaf36eebfab537e458929e194460051ccad279c72b755a167eebd4b3" + ], + "version": "==17.5.0" + }, + "lxml": { + "hashes": [ + "sha256:06d4e0bbb1d62e38ae6118406d7cdb4693a3fa34ee3762238bcb96c9e36a93cd", + "sha256:0701f7965903a1c3f6f09328c1278ac0eee8f56f244e66af79cb224b7ef3801c", + "sha256:1f2c4ec372bf1c4a2c7e4bb20845e8bcf8050365189d86806bad1e3ae473d081", + "sha256:4235bc124fdcf611d02047d7034164897ade13046bda967768836629bc62784f", + "sha256:5828c7f3e615f3975d48f40d4fe66e8a7b25f16b5e5705ffe1d22e43fb1f6261", + "sha256:585c0869f75577ac7a8ff38d08f7aac9033da2c41c11352ebf86a04652758b7a", + "sha256:5d467ce9c5d35b3bcc7172c06320dddb275fea6ac2037f72f0a4d7472035cea9", + "sha256:63dbc21efd7e822c11d5ddbedbbb08cd11a41e0032e382a0fd59b0b08e405a3a", + "sha256:7bc1b221e7867f2e7ff1933165c0cec7153dce93d0cdba6554b42a8beb687bdb", + "sha256:8620ce80f50d023d414183bf90cc2576c2837b88e00bea3f33ad2630133bbb60", + "sha256:8a0ebda56ebca1a83eb2d1ac266649b80af8dd4b4a3502b2c1e09ac2f88fe128", + "sha256:90ed0e36455a81b25b7034038e40880189169c308a3df360861ad74da7b68c1a", + "sha256:95e67224815ef86924fbc2b71a9dbd1f7262384bca4bc4793645794ac4200717", + "sha256:afdb34b715daf814d1abea0317b6d672476b498472f1e5aacbadc34ebbc26e89", + "sha256:b4b2c63cc7963aedd08a5f5a454c9f67251b1ac9e22fd9d72836206c42dc2a72", + "sha256:d068f55bda3c2c3fcaec24bd083d9e2eede32c583faf084d6e4b9daaea77dde8", + "sha256:d5b3c4b7edd2e770375a01139be11307f04341ec709cf724e0f26ebb1eef12c3", + "sha256:deadf4df349d1dcd7b2853a2c8796593cc346600726eff680ed8ed11812382a7", + "sha256:df533af6f88080419c5a604d0d63b2c33b1c0c4409aba7d0cb6de305147ea8c8", + "sha256:e4aa948eb15018a657702fee0b9db47e908491c64d36b4a90f59a64741516e77", + "sha256:e5d842c73e4ef6ed8c1bd77806bf84a7cb535f9c0cf9b2c74d02ebda310070e1", + "sha256:ebec08091a22c2be870890913bdadd86fcd8e9f0f22bcb398abd3af914690c15", + "sha256:edc15fcfd77395e24543be48871c251f38132bb834d9fdfdad756adb6ea37679", + "sha256:f2b74784ed7e0bc2d02bd53e48ad6ba523c9b36c194260b7a5045071abbb1012", + "sha256:fa071559f14bd1e92077b1b5f6c22cf09756c6de7139370249eb372854ce51e6", + "sha256:fd52e796fee7171c4361d441796b64df1acfceb51f29e545e812f16d023c4bbc", + "sha256:fe976a0f1ef09b3638778024ab9fb8cde3118f203364212c198f71341c0715ca" + ], + "markers": "python_version != '3.4'", + "version": "==4.5.0" + }, + "parsel": { + "hashes": [ + "sha256:4da4262ba4605573b6b72a5f557616a2fc9dee7a47a1efad562752a28d366723", + "sha256:74f8e9d3b345b14cb1416bd777a03982cde33a74d8b32e0c71e651d07d41d40a" + ], + "version": "==1.5.2" + }, + "protego": { + "hashes": [ + "sha256:a682771bc7b51b2ff41466460896c1a5a653f9a1e71639ef365a72e66d8734b4" + ], + "version": "==0.1.16" + }, + "pyasn1": { + "hashes": [ + "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d", + "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba" + ], + "version": "==0.4.8" + }, + "pyasn1-modules": { + "hashes": [ + "sha256:905f84c712230b2c592c19470d3ca8d552de726050d1d1716282a1f6146be65e", + "sha256:a50b808ffeb97cb3601dd25981f6b016cbb3d31fbf57a8b8a87428e6158d0c74" + ], + "version": "==0.2.8" + }, + "pycparser": { + "hashes": [ + "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", + "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" + ], + "version": "==2.20" + }, + "pydispatcher": { + "hashes": [ + "sha256:5570069e1b1769af1fe481de6dd1d3a388492acddd2cdad7a3bde145615d5caf", + "sha256:5be4a8be12805ef7d712dd9a93284fb8bc53f309867e573f653a72e5fd10e433" + ], + "version": "==2.0.5" + }, + "pyhamcrest": { + "hashes": [ + "sha256:412e00137858f04bde0729913874a48485665f2d36fe9ee449f26be864af9316", + "sha256:7ead136e03655af85069b6f47b23eb7c3e5c221aa9f022a4fbb499f5b7308f29" + ], + "version": "==2.0.2" + }, + "pyopenssl": { + "hashes": [ + "sha256:621880965a720b8ece2f1b2f54ea2071966ab00e2970ad2ce11d596102063504", + "sha256:9a24494b2602aaf402be5c9e30a0b82d4a5c67528fe8fb475e3f3bc00dd69507" + ], + "version": "==19.1.0" + }, + "queuelib": { + "hashes": [ + "sha256:42b413295551bdc24ed9376c1a2cd7d0b1b0fa4746b77b27ca2b797a276a1a17", + "sha256:ff43b5b74b9266f8df4232a8f768dc4d67281a271905e2ed4a3689d4d304cd02" + ], + "version": "==1.5.0" + }, + "scrapy": { + "hashes": [ + "sha256:640aea0f9be9b055f5cfec5ab78ee88bb37a5be3809b138329bd2af51392ec7f", + "sha256:883ab2dcb6cafb22c7448616baeb5b4d7bedef57ca6a9a5f4e902cb1d6e7aeca" + ], + "index": "pypi", + "version": "==2.1.0" + }, + "service-identity": { + "hashes": [ + "sha256:001c0707759cb3de7e49c078a7c0c9cd12594161d3bf06b9c254fdcb1a60dc36", + "sha256:0858a54aabc5b459d1aafa8a518ed2081a285087f349fe3e55197989232e2e2d" + ], + "version": "==18.1.0" + }, + "six": { + "hashes": [ + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" + ], + "version": "==1.14.0" + }, + "twisted": { + "hashes": [ + "sha256:040eb6641125d2a9a09cf198ec7b83dd8858c6f51f6770325ed9959c00f5098f", + "sha256:147780b8caf21ba2aef3688628eaf13d7e7fe02a86747cd54bfaf2140538f042", + "sha256:158ddb80719a4813d292293ac44ba41d8b56555ed009d90994a278237ee63d2c", + "sha256:2182000d6ffc05d269e6c03bfcec8b57e20259ca1086180edaedec3f1e689292", + "sha256:25ffcf37944bdad4a99981bc74006d735a678d2b5c193781254fbbb6d69e3b22", + "sha256:3281d9ce889f7b21bdb73658e887141aa45a102baf3b2320eafcfba954fcefec", + "sha256:356e8d8dd3590e790e3dba4db139eb8a17aca64b46629c622e1b1597a4a92478", + "sha256:70952c56e4965b9f53b180daecf20a9595cf22b8d0935cd3bd664c90273c3ab2", + "sha256:7408c6635ee1b96587289283ebe90ee15dbf9614b05857b446055116bc822d29", + "sha256:7c547fd0215db9da8a1bc23182b309e84a232364cc26d829e9ee196ce840b114", + "sha256:894f6f3cfa57a15ea0d0714e4283913a5f2511dbd18653dd148eba53b3919797", + "sha256:94ac3d55a58c90e2075c5fe1853f2aa3892b73e3bf56395f743aefde8605eeaa", + "sha256:a58e61a2a01e5bcbe3b575c0099a2bcb8d70a75b1a087338e0c48dd6e01a5f15", + "sha256:c09c47ff9750a8e3aa60ad169c4b95006d455a29b80ad0901f031a103b2991cd", + "sha256:ca3a0b8c9110800e576d89b5337373e52018b41069bc879f12fa42b7eb2d0274", + "sha256:cd1dc5c85b58494138a3917752b54bb1daa0045d234b7c132c37a61d5483ebad", + "sha256:cdbc4c7f0cd7a2218b575844e970f05a1be1861c607b0e048c9bceca0c4d42f7", + "sha256:d267125cc0f1e8a0eed6319ba4ac7477da9b78a535601c49ecd20c875576433a", + "sha256:d72c55b5d56e176563b91d11952d13b01af8725c623e498db5507b6614fc1e10", + "sha256:d95803193561a243cb0401b0567c6b7987d3f2a67046770e1dccd1c9e49a9780", + "sha256:e92703bed0cc21d6cb5c61d66922b3b1564015ca8a51325bd164a5e33798d504", + "sha256:f058bd0168271de4dcdc39845b52dd0a4a2fecf5f1246335f13f5e96eaebb467", + "sha256:f3c19e5bd42bbe4bf345704ad7c326c74d3fd7a1b3844987853bef180be638d4" + ], + "version": "==20.3.0" + }, + "w3lib": { + "hashes": [ + "sha256:847704b837b2b973cddef6938325d466628e6078266bc2e1f7ac49ba85c34823", + "sha256:8b1854fef570b5a5fc84d960e025debd110485d73fd283580376104762774315" + ], + "version": "==1.21.0" + }, + "zope.interface": { + "hashes": [ + "sha256:0103cba5ed09f27d2e3de7e48bb320338592e2fabc5ce1432cf33808eb2dfd8b", + "sha256:14415d6979356629f1c386c8c4249b4d0082f2ea7f75871ebad2e29584bd16c5", + "sha256:1ae4693ccee94c6e0c88a4568fb3b34af8871c60f5ba30cf9f94977ed0e53ddd", + "sha256:1b87ed2dc05cb835138f6a6e3595593fea3564d712cb2eb2de963a41fd35758c", + "sha256:269b27f60bcf45438e8683269f8ecd1235fa13e5411de93dae3b9ee4fe7f7bc7", + "sha256:27d287e61639d692563d9dab76bafe071fbeb26818dd6a32a0022f3f7ca884b5", + "sha256:39106649c3082972106f930766ae23d1464a73b7d30b3698c986f74bf1256a34", + "sha256:40e4c42bd27ed3c11b2c983fecfb03356fae1209de10686d03c02c8696a1d90e", + "sha256:461d4339b3b8f3335d7e2c90ce335eb275488c587b61aca4b305196dde2ff086", + "sha256:4f98f70328bc788c86a6a1a8a14b0ea979f81ae6015dd6c72978f1feff70ecda", + "sha256:558a20a0845d1a5dc6ff87cd0f63d7dac982d7c3be05d2ffb6322a87c17fa286", + "sha256:562dccd37acec149458c1791da459f130c6cf8902c94c93b8d47c6337b9fb826", + "sha256:5e86c66a6dea8ab6152e83b0facc856dc4d435fe0f872f01d66ce0a2131b7f1d", + "sha256:60a207efcd8c11d6bbeb7862e33418fba4e4ad79846d88d160d7231fcb42a5ee", + "sha256:645a7092b77fdbc3f68d3cc98f9d3e71510e419f54019d6e282328c0dd140dcd", + "sha256:6874367586c020705a44eecdad5d6b587c64b892e34305bb6ed87c9bbe22a5e9", + "sha256:74bf0a4f9091131de09286f9a605db449840e313753949fe07c8d0fe7659ad1e", + "sha256:7b726194f938791a6691c7592c8b9e805fc6d1b9632a833b9c0640828cd49cbc", + "sha256:8149ded7f90154fdc1a40e0c8975df58041a6f693b8f7edcd9348484e9dc17fe", + "sha256:8cccf7057c7d19064a9e27660f5aec4e5c4001ffcf653a47531bde19b5aa2a8a", + "sha256:911714b08b63d155f9c948da2b5534b223a1a4fc50bb67139ab68b277c938578", + "sha256:a5f8f85986197d1dd6444763c4a15c991bfed86d835a1f6f7d476f7198d5f56a", + "sha256:a744132d0abaa854d1aad50ba9bc64e79c6f835b3e92521db4235a1991176813", + "sha256:af2c14efc0bb0e91af63d00080ccc067866fb8cbbaca2b0438ab4105f5e0f08d", + "sha256:b054eb0a8aa712c8e9030065a59b5e6a5cf0746ecdb5f087cca5ec7685690c19", + "sha256:b0becb75418f8a130e9d465e718316cd17c7a8acce6fe8fe07adc72762bee425", + "sha256:b1d2ed1cbda2ae107283befd9284e650d840f8f7568cb9060b5466d25dc48975", + "sha256:ba4261c8ad00b49d48bbb3b5af388bb7576edfc0ca50a49c11dcb77caa1d897e", + "sha256:d1fe9d7d09bb07228650903d6a9dc48ea649e3b8c69b1d263419cc722b3938e8", + "sha256:d7804f6a71fc2dda888ef2de266727ec2f3915373d5a785ed4ddc603bbc91e08", + "sha256:da2844fba024dd58eaa712561da47dcd1e7ad544a257482392472eae1c86d5e5", + "sha256:dcefc97d1daf8d55199420e9162ab584ed0893a109f45e438b9794ced44c9fd0", + "sha256:dd98c436a1fc56f48c70882cc243df89ad036210d871c7427dc164b31500dc11", + "sha256:e74671e43ed4569fbd7989e5eecc7d06dc134b571872ab1d5a88f4a123814e9f", + "sha256:eb9b92f456ff3ec746cd4935b73c1117538d6124b8617bc0fe6fda0b3816e345", + "sha256:ebb4e637a1fb861c34e48a00d03cffa9234f42bef923aec44e5625ffb9a8e8f9", + "sha256:ef739fe89e7f43fb6494a43b1878a36273e5924869ba1d866f752c5812ae8d58", + "sha256:f40db0e02a8157d2b90857c24d89b6310f9b6c3642369852cdc3b5ac49b92afc", + "sha256:f68bf937f113b88c866d090fea0bc52a098695173fc613b055a17ff0cf9683b6", + "sha256:fb55c182a3f7b84c1a2d6de5fa7b1a05d4660d866b91dbf8d74549c57a1499e8" + ], + "version": "==5.1.0" + } + }, + "develop": {} +} diff --git a/scripts/creatures/creatures/__init__.py b/scripts/creatures/creatures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/creatures/creatures/items.py b/scripts/creatures/creatures/items.py new file mode 100644 index 0000000..f5c3add --- /dev/null +++ b/scripts/creatures/creatures/items.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class Creature(scrapy.Item): + # define the fields for your item here like: + name = scrapy.Field() + family = scrapy.Field() + level = scrapy.Field() + publisher = scrapy.Field() + source = scrapy.Field() + url = scrapy.Field() diff --git a/scripts/creatures/creatures/middlewares.py b/scripts/creatures/creatures/middlewares.py new file mode 100644 index 0000000..05701a8 --- /dev/null +++ b/scripts/creatures/creatures/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class CreaturesSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class CreaturesDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/scripts/creatures/creatures/pipelines.py b/scripts/creatures/creatures/pipelines.py new file mode 100644 index 0000000..ee1fdd6 --- /dev/null +++ b/scripts/creatures/creatures/pipelines.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +class CreaturesPipeline: + def process_item(self, item, spider): + return item diff --git a/scripts/creatures/creatures/settings.py b/scripts/creatures/creatures/settings.py new file mode 100644 index 0000000..768540f --- /dev/null +++ b/scripts/creatures/creatures/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for creatures project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'creatures' + +SPIDER_MODULES = ['creatures.spiders'] +NEWSPIDER_MODULE = 'creatures.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'creatures (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'creatures.middlewares.CreaturesSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'creatures.middlewares.CreaturesDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# 'creatures.pipelines.CreaturesPipeline': 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/scripts/creatures/creatures/spiders/__init__.py b/scripts/creatures/creatures/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/scripts/creatures/creatures/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/scripts/creatures/creatures/spiders/creatures.py b/scripts/creatures/creatures/spiders/creatures.py new file mode 100644 index 0000000..5dad0be --- /dev/null +++ b/scripts/creatures/creatures/spiders/creatures.py @@ -0,0 +1,31 @@ +import scrapy +import datetime as dt +from creatures.items import Creature +from typing import List + + +class CreateListSpider(scrapy.Spider): + name = "creatures" + start_urls = ["https://pf2.d20pfsrd.com/monster"] + + def parse(self, response): + rows = response.xpath('//table[@id="archive-data-table"]' + '/tr') + creatures: List[Creature] = [] + for row in rows: + name = row.xpath('.//td[1]/a/text()').get() + family = row.xpath('.//td[2]/a/text()').get() + level = row.xpath('.//td[3]/a/text()').get() + publisher = row.xpath('.//td[4]/a/text()').get() + source = row.xpath('.//td[5]/a/text()').get() + url = row.xpath('.//td[1]/a/@href').get() + creatures.append(Creature(name=name, + family=family, + level=level, + publisher=publisher, + source=source, + url=url,)) + yield { + 'date': dt.datetime.now(tz=dt.timezone.utc), + 'data': creatures, + } diff --git a/scripts/creatures/scrapy.cfg b/scripts/creatures/scrapy.cfg new file mode 100644 index 0000000..8653322 --- /dev/null +++ b/scripts/creatures/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = creatures.settings + +[deploy] +#url = http://localhost:6800/ +project = creatures