Python语言学习:scrapy-redis 包
一个能让你的爬虫以数倍速度运行的工具包
安装
pip install scrapy-redis -U
添加到项目
- 准备好 redis server
- 添加配置到
settings.py
- 特别留意:
RedisPipeline
这个的优先级,要调到最后(# Store scraped item in redis for post-processing.) - 原来的
scrapy.spider/xx
等调成 RedisSpider
settings.py 配置如下
# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Enables stats shared based on Redis
STATS_CLASS = "scrapy_redis.stats.RedisStatsCollector"
# Default requests serializer is pickle, but it can be changed to any module
# with loads and dumps functions. Note that pickle is not compatible between
# python versions.
# Caveat: In python 3.x, the serializer must return strings keys and support
# bytes as values. Because of this reason the json or msgpack module will not
# work by default. In python 2.x there is no such issue and you can use
# 'json' or 'msgpack' as serializers.
#SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"
# Don't cleanup redis queues, allows to pause/resume crawls.
#SCHEDULER_PERSIST = True
# Schedule requests using a priority queue. (default)
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Alternative queues.
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
# Max idle time to prevent the spider from being closed when distributed crawling.
# This only works if queue class is SpiderQueue or SpiderStack,
# and may also block the same time when your spider start at the first time (because the queue is empty).
#SCHEDULER_IDLE_BEFORE_CLOSE = 10
# Maximum idle time before close spider.
# When the number of idle seconds is greater than MAX_IDLE_TIME_BEFORE_CLOSE, the crawler will close.
# If 0, the crawler will DontClose forever to wait for the next request.
# If negative number, the crawler will immediately close when the queue is empty, just like Scrapy.
#MAX_IDLE_TIME_BEFORE_CLOSE = 0
# Store scraped item in redis for post-processing.
ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 300
}
# The item pipeline serializes and stores the items in this redis key.
#REDIS_ITEMS_KEY = '%(spider)s:items'
# The items serializer is by default ScrapyJSONEncoder. You can use any
# importable path to a callable object.
#REDIS_ITEMS_SERIALIZER = 'json.dumps'
# Specify the host and port to use when connecting to Redis (optional).
#REDIS_HOST = 'localhost'
#REDIS_PORT = 6379
# Specify the full Redis URL for connecting (optional).
# If set, this takes precedence over the REDIS_HOST and REDIS_PORT settings.
#REDIS_URL = 'redis://user:pass@hostname:9001'
# Custom redis client parameters (i.e.: socket timeout, etc.)
#REDIS_PARAMS = {}
# Use custom redis client class.
#REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'
# If True, it uses redis' ``SPOP`` operation. You have to use the ``SADD``
# command to add URLs to the redis queue. This could be useful if you
# want to avoid duplicates in your start urls list and the order of
# processing does not matter.
#REDIS_START_URLS_AS_SET = False
# If True, it uses redis ``zrevrange`` and ``zremrangebyrank`` operation. You have to use the ``zadd``
# command to add URLS and Scores to redis queue. This could be useful if you
# want to use priority and avoid duplicates in your start urls list.
#REDIS_START_URLS_AS_ZSET = False
# Default start urls key for RedisSpider and RedisCrawlSpider.
#REDIS_START_URLS_KEY = '%(name)s:start_urls'
# Use other encoding than utf-8 for redis.
#REDIS_ENCODING = 'latin1'
REDIS_URL = 'redis://default:123456@localhost:6379'
redis 配置<settings.py中>
REDIS_URL = 'redis://default:123456@localhost:6379'
改造 spider 代码如下
from scrapy_redis.spiders import RedisSpider
class MySpider(RedisSpider):
name = 'myspider'
def parse(self, response):
# do stuff
pass
参考