Scraper for refugee resources in Berlin


Copy and paste the "spider" code below into a new file, such as spider-kreuzberg.py. Then run it from a terminal with a command like the following:

torsocks -i scrapy runspider spider-kreuzberg.py -o kreuzberg_data.csv

Scrapy "spider" file

import scrapy

class ExtractingTheInfeasible(scrapy.Spider):
    """Scrape the Friedrichshain-Kreuzberg refugee-resources table.

    Yields one dict per table row (keys from ``item_selectors``) and
    follows the "next page" link until pagination runs out.
    """

    ### User variables
    #
    # First page to fetch.
    start_urls = ['http://www.berlin.de/ba-friedrichshain-kreuzberg/aktuelles/fluechtlingshilfe/angebote-im-bezirk/traeger-und-aemter/?q=&sprache=--+Alles+--&q_geo=&ipp=5#searchresults']
    # Spider name used by `scrapy runspider` / `scrapy crawl`.
    name = 'refugee_resources'
    # CSS selector matching one result row.
    row_selector = 'tbody tr'
    # CSS selector for the "next page" href; set to '' to disable paging.
    next_selector = '.pager-item-next > a:nth-child(1)::attr(href)'
    # Output field name -> CSS selector, evaluated relative to a row.
    # The special keys "link" and "image_urls" are resolved to absolute URLs.
    item_selectors = {
        'organisation': 'td:nth-child(1) > div > p',
        'offer': 'td:nth-child(2)::text',
        'language': 'td:nth-child(3)::text',
        'address': 'td:nth-child(4)::text',
        'link': 'td:nth-child(1) > a:nth-child(2)::attr(href)'
    }
    #
    ###

    custom_settings = {
        # 'DOWNLOAD_DELAY': '30',
        # 'DEPTH_LIMIT': '100',
        # 'ITEM_PIPELINES': {
        #     'scrapy.pipelines.files.FilesPipeline': 1,
        #     'scrapy.pipelines.images.ImagesPipeline': 1
        # },
        # 'IMAGES_STORE': 'media',
        # 'IMAGES_THUMBS': { 'small': (50, 50) },
        # 'FILES_STORE': 'files',
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:45.0) Gecko/20100101 Firefox/45.0',
        'TELNETCONSOLE_ENABLED': False,
        # Disable the S3 download handler (avoids a boto/botocore import).
        'DOWNLOAD_HANDLERS': {'s3': None}
    }

    def parse(self, response):
        """Extract one item per table row, then queue the next page.

        :param response: the downloaded page (``scrapy.http.Response``).
        :yields: dicts of extracted fields, plus a follow-up ``Request``
                 for the next results page when one exists.
        """
        for row in response.css(self.row_selector):
            item = {}
            # Iterate field/selector pairs directly instead of re-indexing
            # the dict for every key.
            for field, selector in self.item_selectors.items():
                content = row.css(selector).extract_first()
                if field == "image_urls" and content:
                    # ImagesPipeline expects a list of absolute URLs;
                    # urljoin handles relative, root-relative and
                    # already-absolute hrefs alike.
                    item[field] = [response.urljoin(content)]
                elif field == "link" and content:
                    item[field] = response.urljoin(content)
                else:
                    item[field] = content
            yield item

        if self.next_selector:
            next_page = response.css(self.next_selector).extract_first()
            if next_page:
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)