pip install scrapy-redis-bloomfilter-block-cluster
# 确保使用此调度程序 SCHEDULER = "scrapy_redis_bloomfilter_block_cluster.scheduler.Scheduler" # 持久化 SCHEDULER_PERSIST = True # 确保全部蜘蛛经过redis共享相同的重复过滤器 DUPEFILTER_CLASS = "scrapy_redis_bloomfilter_block_cluster.dupefilter.RFPDupeFilter" # 队列 SCHEDULER_QUEUE_CLASS = 'scrapy_redis_bloomfilter_block_cluster.queue.PriorityQueue' # Redis的URL # REDIS_URL = '的Redis://:为admin123 @本地:6379' #或redis的://本地主机:6379 # REDIS_HOST = '本地主机' # REDIS_PORT = 6379 # Redis的集群,若是REDIS_MASTER_NODES设置,REDIS_URL不起做用。 REDIS_CLUSTER_NODES = [ {"host": "", "port": ""}, {"host": "", "port": ""}, {"host": "", "port": ""}, {"host": "", "port": ""}, {"host": "", "port": ""}, {"host": "", "port": ""}, ] # 要使用的哈希函数数,至少 为6 BLOOMFILTER_HASH_NUMBER = 6 # 布隆过滤器用法的Redis存储位,30个装置2 ^ 30 = 128MB,赋予数值30 BLOOMFILTER_BIT = 30 # 使用Bloomfilter的块数,一个块能够使用最大内存512MB BLOOMFILTER_BLOCK_NUM = 1 DUPEFILTER_DEBUG = True
# 不想过滤起始 URL 的设置
打开爬虫主程序,
把 dont_filter 赋值为 True 即可。
其他的也一样:如果不想过滤文章 URL,则在爬取文章的方法里做同样的设置即可。
# Example: build the start request with dont_filter=True so the start URL is
# not dropped by the shared Bloom-filter dupefilter.
# NOTE(review): this statement is a fragment from inside a spider method
# (references self and an external `config` dict — assumes config['start_url']
# holds the spider's start URL; confirm against the enclosing spider).
# NOTE(review): assigning to the name "Request" shadows scrapy.Request —
# presumably a lowercase local name (e.g. `request`) was intended.
Request = scrapy.Request(url=config['start_url'], callback=self.parse, dont_filter=True)