class NcbiDownloadSpider(scrapy.Spider):
    """Spider that emits file-download items for NCBI FASTA records
    that have not been crawled yet."""

    name = 'ncbi_download'

    def parse(self, response, **kwargs):
        """Yield download items for up to 100 un-crawled NcbiFasta rows.

        Each item carries:
          - 'ncbi_id': the record identifier (used downstream to mark
            the row as crawled),
          - 'file_urls': the FASTA URL list consumed by Scrapy's
            FilesPipeline,
          - 'filename': the relative storage path returned by the
            pipeline's file_path() override.
        """
        # NOTE: peewee requires `== False` (overloaded operator) to build
        # the SQL predicate; `is False` / `not` would not translate.
        records = NcbiFasta.select().where(NcbiFasta.is_crawled == False).limit(100)
        for record in records:
            filename = f'{record.ncbi_id}.fasta'
            # BUG FIX: the path previously contained a literal placeholder
            # instead of interpolating `filename`, so every record in a
            # group resolved to the same broken storage path.
            filepath = f'{record.group}/{filename}'
            yield {
                'ncbi_id': record.ncbi_id,
                'file_urls': [record.fasta_url],
                'filename': filepath,
            }
|
pipelines.py | from itemadapter import ItemAdapter
from spider_classify_fasta.models.ncbi_fasta import NcbiFasta
from spider_classify_fasta.settings import FILES_STORE
from scrapy.pipelines.files import FilesPipeline
class SpiderClassifyFastaFilesPipeline(FilesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
filename = item['filename']
return filename
def file_downloaded(self, response, request, info, *, item=None):
ncbi_id = item['ncbi_id']
checksum = super().file_downloaded(response, request, info, item=item)
entity = NcbiFasta.select().where((NcbiFasta.ncbi_id == ncbi_id)).first()
if entity:
entity.is_crawled = True
entity.md5 = checksum
entity.save()
return checksum
|