Learning Scrapy: Downloading Files - cheatsheet

Notes and a summary on downloading files with Scrapy.
Updated: 2023-05-29 21:13:00

Basic steps

  • Add FILES_STORE = "./downloads" to settings.py
  • In a custom FilesPipeline, override the file_path method to control the saved file name (the resulting path is illustrated right below)
  • In the spider, yield items that carry file_urls plus the filename logic
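How the pieces combine: FilesPipeline joins FILES_STORE with whatever file_path returns, so an item whose filename field is, say, 'bacteria/NC_000913.fasta' (hypothetical values) ends up on disk at ./downloads/bacteria/NC_000913.fasta.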

Configuration

Core code for file naming
settings.py
# enable both project pipelines (lower number runs earlier in the chain)
ITEM_PIPELINES = {
    "spider_classify_fasta.pipelines.SpiderClassifyFastaPipeline": 300,
    "spider_classify_fasta.pipelines.SpiderClassifyFastaFilesPipeline": 200,
}

# root directory where FilesPipeline saves downloaded files
FILES_STORE = "./downloads"
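A couple of related Scrapy settings can go in settings.py as well; treat this as an optional sketch rather than part of the project's actual configuration:

# re-download a file if the stored copy is older than 30 days
# (Scrapy's FILES_EXPIRES setting; the default is 90 days)
FILES_EXPIRES = 30

# follow redirects when fetching media files (off by default)
MEDIA_ALLOW_REDIRECTS = True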
spiders/main_spider.py
import scrapy

from spider_classify_fasta.models.ncbi_fasta import NcbiFasta


class NcbiDownloadSpider(scrapy.Spider):
    name = 'ncbi_download'

    def parse(self, response, **kwargs):
        # pick up to 100 records that have not been downloaded yet
        # (peewee requires the explicit `== False` comparison here)
        records = NcbiFasta.select().where(NcbiFasta.is_crawled == False).limit(100)
        for record in records:
            filename = f'{record.ncbi_id}.fasta'
            filepath = f'{record.group}/{filename}'
            yield {
                'ncbi_id': record.ncbi_id,
                'file_urls': [record.fasta_url],  # FilesPipeline downloads every URL in file_urls
                'filename': filepath,             # consumed by the custom file_path below
            }
pipelines.py
from itemadapter import ItemAdapter
from spider_classify_fasta.models.ncbi_fasta import NcbiFasta
from spider_classify_fasta.settings import FILES_STORE
from scrapy.pipelines.files import FilesPipeline


class SpiderClassifyFastaFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # use the path built in the spider ("<group>/<ncbi_id>.fasta")
        # instead of the default SHA-1-based name; it is joined with FILES_STORE
        return item['filename']

    def file_downloaded(self, response, request, info, *, item=None):
        # let the parent class write the file and compute the checksum,
        # then mark the database record as crawled
        ncbi_id = item['ncbi_id']
        checksum = super().file_downloaded(response, request, info, item=item)
        entity = NcbiFasta.select().where(NcbiFasta.ncbi_id == ncbi_id).first()
        if entity:
            entity.is_crawled = True
            entity.md5 = checksum
            entity.save()
        return checksum
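The NcbiFasta model itself is not shown in this post. Judging from the fields used above (ncbi_id, fasta_url, group, is_crawled, md5) and the query syntax, it is presumably a peewee model along these lines; the field types and database backend are only guesses:

from peewee import Model, CharField, BooleanField, SqliteDatabase

db = SqliteDatabase('ncbi_fasta.db')  # assumed backend; the real project may differ


class NcbiFasta(Model):
    ncbi_id = CharField(unique=True)
    fasta_url = CharField()
    group = CharField()
    is_crawled = BooleanField(default=False)
    md5 = CharField(null=True)

    class Meta:
        database = db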

cheatsheet

Common snippets
Get a filename from a URL
import os
from urllib.parse import urlparse

def get_filename_from_url(url):
    parsed_url = urlparse(url)
    path = parsed_url.path
    # take the last segment of the URL path as the filename
    filename = os.path.basename(path)
    return filename

# usage
url = 'http://example.com/files/image.jpg'
filename = get_filename_from_url(url)
print(filename)  # prints: image.jpg
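If the saved name should come straight from the download URL instead of an item field, the helper above can be reused inside file_path. This is only a sketch (the class name is made up, and it assumes get_filename_from_url is importable), not part of the project above:

from scrapy.pipelines.files import FilesPipeline


class UrlNameFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # name the saved file after the last segment of the request URL
        return get_filename_from_url(request.url)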

References