Skip to content Skip to sidebar Skip to footer

Why Does My Scrapy Spider Only Scrape Some Of My Data?

I'm trying to use Scrapy to scrape IMDb data (episode information and cast list) for each episode of Law & Order: SVU. After I run the code below, I export it to CSV via the co

Solution 1:

I made some changes to your code. In addition, I show you how to use Items and Pipelines.

spiders/svu.py

# -*- coding: utf-8 -*-
import scrapy
from ..items import EpisodeItem, CastItem

class SvuSpider(scrapy.Spider):
    """Crawl IMDb episode pages, following each page's "next episode" link.

    For every episode page one ``EpisodeItem`` is yielded, the link(s) to the
    full cast list are followed (handled by ``parse_cast``), and the link to
    the next episode is followed (handled by ``parse`` again).
    """

    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        """Yield the episode item, then requests for the cast list and next episode.

        Missing page elements produce ``None`` / empty values instead of
        raising, so an incomplete page cannot abort the callback before the
        next-episode link has been followed.
        """
        # Gather episode information.  The two 'bp_heading' text nodes are
        # read as season (first) and episode (second); query them only once.
        headings = response.xpath("//div[@class='bp_heading']/text()").extract()
        season = headings[0] if len(headings) > 0 else None
        episode = headings[1] if len(headings) > 1 else None

        yield EpisodeItem(
            season=season,
            episode=episode,
            # default='' keeps .strip() safe when the <h1> is absent.
            episode_name=response.xpath(
                "//h1[@itemprop='name']/text()"
            ).extract_first(default='').strip(),
            date_published=response.xpath(
                "//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content"
            ).extract(),
            rating_value=response.xpath(
                "//span[@itemprop='ratingValue']/text()"
            ).extract(),
            rating_count=response.xpath(
                "//span[@itemprop='ratingCount']/text()"
            ).extract(),
        )

        # Follow link to full cast list
        for link in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(link, callback=self.parse_cast)

        # Follow link to next episode
        for link in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(link, callback=self.parse)

    def parse_cast(self, response):
        """Yield one ``CastItem`` per cast table found on the page.

        Each item carries the list of actor names and the list of character
        names found inside that table.
        """
        for table in response.xpath("//table[@class='cast_list']"):
            # XPaths are relative to the current table (leading '.'), so a
            # page with several cast tables does not repeat the same data.
            character = table.xpath(".//td[@class='character']/a/text()").extract()
            character.extend(table.xpath(".//td[@class='character']/text()").extract())
            # Drop whitespace-only text nodes and tidy the remaining names.
            character = [c.strip().replace('\n ', '') for c in character if c.strip()]
            # NOTE(review): linked character names are collected before the
            # plain-text ones, so `character` is not index-aligned with `actor`.
            yield CastItem(
                actor=table.xpath(
                    ".//td[@itemprop='actor']/a/span[@itemprop='name']/text()"
                ).extract(),
                character=character,
            )

items.py

from scrapy import Item, Field

    class EpisodeItem(Item):
        season = Field()
        episode = Field()
        episode_name = Field()
        date_published = Field()
        rating_value = Field()
        rating_count = Field()

    class CastItem(Item):
        actor = Field()
        character = Field()

pipelines.py

from scrapy import signals
from scrapy.exporters import CsvItemExporter

from .items import CastItem, EpisodeItem

class IMDBPipeline(object):
    """Item pipeline that writes episode and cast items to separate CSV files.

    ``EpisodeItem`` instances are exported to ``episode.csv`` and ``CastItem``
    instances to ``cast.csv``.  The files are opened when the spider opens and
    closed when it closes.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and hook it to the spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open one CSV file and one exporter per item kind."""
        item_names = ['episode', 'cast']
        # The files are opened in binary mode and handed to CsvItemExporter.
        self.files = {n: open('%s.csv' % n, 'w+b') for n in item_names}
        self.exporters = {n: CsvItemExporter(f) for n, f in self.files.items()}
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish every exporter and close the underlying files."""
        try:
            for exporter in self.exporters.values():
                exporter.finish_exporting()
        finally:
            # Close the files even if an exporter fails while finishing.
            for handle in self.files.values():
                handle.close()

    def process_item(self, item, spider):
        """Send *item* to the exporter matching its type and return it unchanged."""
        if isinstance(item, EpisodeItem):
            self.exporters['episode'].export_item(item)

        if isinstance(item, CastItem):
            self.exporters['cast'].export_item(item)

        return item

Add to settings file:

# Enable the CSV pipeline; PROJECT_NAME is a placeholder for your project's package.
ITEM_PIPELINES = {"PROJECT_NAME.pipelines.IMDBPipeline": 300}

Be careful: you need to replace PROJECT_NAME with the name of your own project.

Post a Comment for "Why Does My Scrapy Spider Only Scrape Some Of My Data?"