| spiders/main_spider.py | class NcbiDownloadSpider(scrapy.Spider):
    name = 'ncbi_download'
    def parse(self, response, **kwargs):
        """Yield one download item per NcbiFasta row not yet marked crawled.

        At most 100 rows are requested per call. Neither ``response`` nor
        ``kwargs`` is read here; the items are built purely from the
        database rows.

        Yields:
            dict: ``ncbi_id`` (the row's id), ``file_urls`` (a one-element
            list holding the row's ``fasta_url``) and ``filename`` (the
            relative path ``<group>/<ncbi_id>.fasta``).
        """
        # `== False` is deliberate: the comparison is applied to a model
        # field to build the query filter (assumed peewee-style ORM --
        # confirm), so it must not be rewritten as `not ...` / `is False`.
        not_crawled = NcbiFasta.is_crawled == False  # noqa: E712
        pending = NcbiFasta.select().where(not_crawled).limit(100)

        for row in pending:
            relative_path = f'{row.group}/{row.ncbi_id}.fasta'
            yield {
                'ncbi_id': row.ncbi_id,
                'file_urls': [row.fasta_url],
                'filename': relative_path,
            }
 | 
| pipelines.py | from itemadapter import ItemAdapter
from spider_classify_fasta.models.ncbi_fasta import NcbiFasta
from spider_classify_fasta.settings import FILES_STORE
from scrapy.pipelines.files import FilesPipeline
class SpiderClassifyFastaFilesPipeline(FilesPipeline):
    """Files pipeline that stores each download at the item's own path and
    records the outcome on the matching ``NcbiFasta`` row.

    A row is marked crawled both when its file is freshly downloaded and
    when the pipeline reports the file as already present in the store
    (status ``'uptodate'``). In the latter case Scrapy's ``FilesPipeline``
    does not call ``file_downloaded``, so without the ``item_completed``
    hook such rows would stay un-crawled and be re-selected on every run.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the file: the item's ``'filename'``.

        Raises:
            KeyError: if the item has no ``'filename'`` entry.
            TypeError: if ``item`` is ``None``.
        """
        return item['filename']

    def file_downloaded(self, response, request, info, *, item=None):
        """Persist the downloaded file, then mark the matching row crawled.

        Returns:
            The checksum returned by ``FilesPipeline.file_downloaded``.
        """
        # Read the id first so an item without 'ncbi_id' fails before the
        # file is persisted.
        ncbi_id = item['ncbi_id']
        checksum = super().file_downloaded(response, request, info, item=item)
        self._mark_crawled(ncbi_id, checksum)
        return checksum

    def item_completed(self, results, item, info):
        """Mark rows crawled for files the store already had.

        ``results`` is the list of ``(success, file_info_or_failure)``
        tuples produced by the media pipeline. Only successful results
        whose status is ``'uptodate'`` are handled here; freshly downloaded
        files were already recorded by ``file_downloaded``.

        Returns:
            Whatever ``FilesPipeline.item_completed`` returns for the item.
        """
        ncbi_id = ItemAdapter(item).get('ncbi_id')
        if ncbi_id is not None:
            for ok, file_info in results:
                if ok and file_info.get('status') == 'uptodate':
                    self._mark_crawled(ncbi_id, file_info.get('checksum'))
        return super().item_completed(results, item, info)

    @staticmethod
    def _mark_crawled(ncbi_id, checksum):
        """Set ``is_crawled`` (and ``md5`` when known) on the row with this id.

        Does nothing when no row matches ``ncbi_id``.
        """
        entity = NcbiFasta.select().where(NcbiFasta.ncbi_id == ncbi_id).first()
        if not entity:
            return
        entity.is_crawled = True
        # The store may not report a checksum for an existing file; keep
        # whatever md5 the row already has in that case.
        if checksum is not None:
            entity.md5 = checksum
        entity.save()
 |