Learning Scrapy: Downloading Files - cheatsheet

Notes and a summary on downloading files with Scrapy.
Updated: 2023-05-29 21:13:00

Basic steps

  • Add FILES_STORE = "./downloads" to settings.py
  • In a custom FilesPipeline, override the file_path method to control the saved file name (the resulting path is illustrated right below)
  • In the spider, yield items that carry file_urls plus the filename logic
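How the pieces combine: FilesPipeline joins FILES_STORE with whatever file_path returns, so an item whose filename field is, say, 'bacteria/NC_000913.fasta' (hypothetical values) ends up on disk at ./downloads/bacteria/NC_000913.fasta.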

Configuration

Core code for file naming
settings.py
# enable both project pipelines (lower number runs earlier in the chain)
ITEM_PIPELINES = {
    "spider_classify_fasta.pipelines.SpiderClassifyFastaPipeline": 300,
    "spider_classify_fasta.pipelines.SpiderClassifyFastaFilesPipeline": 200,
}

# root directory where FilesPipeline saves downloaded files
FILES_STORE = "./downloads"
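A couple of related Scrapy settings can go in settings.py as well; treat this as an optional sketch rather than part of the project's actual configuration:

# re-download a file if the stored copy is older than 30 days
# (Scrapy's FILES_EXPIRES setting; the default is 90 days)
FILES_EXPIRES = 30

# follow redirects when fetching media files (off by default)
MEDIA_ALLOW_REDIRECTS = True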
spiders/main_spider.py
import scrapy

from spider_classify_fasta.models.ncbi_fasta import NcbiFasta


class NcbiDownloadSpider(scrapy.Spider):
    name = 'ncbi_download'

    def parse(self, response, **kwargs):
        # pick up to 100 records that have not been downloaded yet
        # (peewee requires the explicit `== False` comparison here)
        records = NcbiFasta.select().where(NcbiFasta.is_crawled == False).limit(100)
        for record in records:
            filename = f'{record.ncbi_id}.fasta'
            filepath = f'{record.group}/{filename}'
            yield {
                'ncbi_id': record.ncbi_id,
                'file_urls': [record.fasta_url],  # FilesPipeline downloads every URL in file_urls
                'filename': filepath,             # consumed by the custom file_path below
            }
pipelines.py
from itemadapter import ItemAdapter
from spider_classify_fasta.models.ncbi_fasta import NcbiFasta
from spider_classify_fasta.settings import FILES_STORE
from scrapy.pipelines.files import FilesPipeline


class SpiderClassifyFastaFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # use the path built in the spider ("<group>/<ncbi_id>.fasta")
        # instead of the default SHA-1-based name; it is joined with FILES_STORE
        return item['filename']

    def file_downloaded(self, response, request, info, *, item=None):
        # let the parent class write the file and compute the checksum,
        # then mark the database record as crawled
        ncbi_id = item['ncbi_id']
        checksum = super().file_downloaded(response, request, info, item=item)
        entity = NcbiFasta.select().where(NcbiFasta.ncbi_id == ncbi_id).first()
        if entity:
            entity.is_crawled = True
            entity.md5 = checksum
            entity.save()
        return checksum
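The NcbiFasta model itself is not shown in this post. Judging from the fields used above (ncbi_id, fasta_url, group, is_crawled, md5) and the query syntax, it is presumably a peewee model along these lines; the field types and database backend are only guesses:

from peewee import Model, CharField, BooleanField, SqliteDatabase

db = SqliteDatabase('ncbi_fasta.db')  # assumed backend; the real project may differ


class NcbiFasta(Model):
    ncbi_id = CharField(unique=True)
    fasta_url = CharField()
    group = CharField()
    is_crawled = BooleanField(default=False)
    md5 = CharField(null=True)

    class Meta:
        database = db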

cheatsheet

Common snippets
Get a filename from a URL
import os
from urllib.parse import urlparse

def get_filename_from_url(url):
    parsed_url = urlparse(url)
    path = parsed_url.path
    # take the last segment of the URL path as the filename
    filename = os.path.basename(path)
    return filename

# usage
url = 'http://example.com/files/image.jpg'
filename = get_filename_from_url(url)
print(filename)  # prints: image.jpg
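If the saved name should come straight from the download URL instead of an item field, the helper above can be reused inside file_path. This is only a sketch (the class name is made up, and it assumes get_filename_from_url is importable), not part of the project above:

from scrapy.pipelines.files import FilesPipeline


class UrlNameFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # name the saved file after the last segment of the request URL
        return get_filename_from_url(request.url)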

References