Copy and paste the "spider" code below into a new file, such as `spider-template.py`. Edit the "User variables" and then run it from Terminal.
Copy and paste the "spider" code below into a new file, such as `spider-template.py`. Edit the "User variables", then run it from Terminal with a command like the following:
torsocks -i scrapy runspider spider-template.py -o extracted_data.csv
Scrapy "spider" file
import scrapy
class ExtractingTheInfeasible(scrapy.Spider):
    """Template spider that scrapes one item per "row" element and paginates.

    Edit the "User variables" section to adapt it to a site:

    * ``start_urls``       -- page(s) where the crawl begins.
    * ``row_selector``     -- CSS selector matching each record on a page.
    * ``column_selectors`` -- maps an output column name to a CSS selector
      evaluated relative to the row; the first match becomes the value
      (``None`` when nothing matches).
    * ``next_selector``    -- CSS selector yielding the href of the next
      page; set it to a falsy value to disable pagination.

    Columns named ``image_urls`` or ``file_urls`` are emitted as
    single-element lists of absolute URLs, and a column named ``link`` is
    emitted as one absolute URL. All other columns are emitted verbatim.
    """

    ### User variables
    #
    start_urls = ['https://some.website.com/some/page']
    name = 'spider_template'
    row_selector = '.your .row > .selector'
    next_selector = 'a.next_page.selector::attr(href)'
    column_selectors = {
        'some_column': '.contents.of > .some.column::text',
        'some_other_column': '.contents.of > a.different.column::attr(href)',
        'a_third_column': '.contents.of > .a3rd.column::text'
    }
    # Uncomment to send HTTP auth credentials (these must be plain
    # assignments so that they become spider attributes):
    # http_user = '???'
    # http_pass = '?????????'
    #
    ###

    custom_settings = {
        # 'DOWNLOAD_DELAY': '30',
        # 'DEPTH_LIMIT': '100',
        # 'ITEM_PIPELINES': {
        #     'scrapy.pipelines.files.FilesPipeline': 1,
        #     'scrapy.pipelines.images.ImagesPipeline': 1
        # },
        # 'IMAGES_STORE': 'media',
        # 'IMAGES_THUMBS': { 'small': (50, 50) },
        # 'FILES_STORE': 'files',
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:45.0) Gecko/20100101 Firefox/45.0',
        'TELNETCONSOLE_ENABLED': False,
        'DOWNLOAD_HANDLERS': {'s3': None}
    }

    def parse(self, response):
        """Yield one dict per matched row, then a request for the next page.

        Args:
            response: the downloaded page for one of ``start_urls`` or for a
                pagination link followed by an earlier call.

        Yields:
            dict: one item per element matching ``row_selector``, keyed by
                the names in ``column_selectors``.
            scrapy.Request: a follow-up request handled by this same method,
                when ``next_selector`` is set and matches on this page.
        """
        for row in response.css(self.row_selector):
            item = {}
            for column, selector in self.column_selectors.items():
                value = row.css(selector).extract_first()
                # URL-bearing columns are resolved with urljoin rather than
                # by prepending scheme://host, so absolute, protocol-relative
                # and page-relative hrefs all come out as valid URLs.
                if value and column in ('image_urls', 'file_urls'):
                    # Emitted as a list, unlike the scalar "link" column.
                    item[column] = [response.urljoin(value)]
                elif value and column == 'link':
                    item[column] = response.urljoin(value)
                else:
                    item[column] = value
            yield item

        if self.next_selector:
            next_page = response.css(self.next_selector).extract_first()
            if next_page:
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)