Learning Scrapy: a cheatsheet for developing a project with Scrapy
Building a production project with Scrapy
Prepare a MySQL database
```bash
# connect to the database
mysql -uroot -p123456 -h127.0.0.1
# create a database
create database spider_aizhan default character set utf8mb4 collate utf8mb4_unicode_ci;
# confirm it was created
show databases;
```
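To confirm the database is reachable from Python as well, here is a minimal connectivity check, assuming pymysql is installed (`pip install pymysql`); the credentials mirror the mysql command above:

```python
# minimal connectivity check; credentials mirror the mysql command above
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       database='spider_aizhan', charset='utf8mb4')
with conn.cursor() as cur:
    cur.execute('SELECT DATABASE()')
    print(cur.fetchone())  # -> ('spider_aizhan',)
conn.close()
```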
Add your own toolbox package: jsw-site-toolbox
```bash
# activate the poetry virtualenv
source $(poetry env info --path)/bin/activate
# install the required package (via a local proxy)
http_proxy=http://127.0.0.1:9090 https_proxy=http://127.0.0.1:9090 pip install git+http://github.com/aric-pypi/jsw-site-toolbox.git#egg=jsw-site-toolbox -U
```
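A quick sanity check after installing is to import the modules this note uses later (jsw_nx is used by the spider code below; it may ship with the toolbox or need its own install):

```python
# import check for the freshly installed toolbox
from jsw_site_toolbox.scrapy.base_pipeline import BasePipeline
import jsw_nx as nx

print(BasePipeline, nx.noop_scrapy_parse)
```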
Define a Scrapy task
```python
import scrapy
import jsw_nx as nx


class CikuEntrySpider(scrapy.Spider):
    name = 'ciku_entry'

    def start_requests(self):
        self.logger.warning('start_requests only task')
        # nx.noop_scrapy_parse is a no-op callback provided by jsw-nx
        yield scrapy.Request(url="https://www.baidu.com", callback=nx.noop_scrapy_parse)
```
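When a spider needs to actually extract data instead of just firing a request, a conventional callback looks like this (a sketch; the CSS selectors are illustrative, not taken from the real site):

```python
# a sketch of a regular parse callback; selectors are illustrative only
def parse(self, response):
    for row in response.css('table tr'):
        yield {
            'keyword': row.css('td::text').get(),
            'url': response.url,
        }
```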
Run the spider
```bash
# run scrapy directly
scrapy crawl ciku_entry
# run the scrapy task inside the poetry environment
poetry run scrapy crawl ciku_entry
```
Add a generic pipeline
```python
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from jsw_site_toolbox.scrapy.base_pipeline import BasePipeline


class SpiderAizhanPipeline(BasePipeline):
    # BasePipeline routes each item to process_<spider.name>, here: ciku_entry
    def process_ciku_entry(self, item, spider):
        spider.logger.warning(self.item_dict)
        spider.logger.warning('item spider.name: ' + spider.name)
        return item
```
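As the boilerplate comment reminds, the pipeline has to be registered in `settings.py`. Assuming Scrapy's default project layout, that looks like:

```python
# settings.py -- enable the pipeline (module path assumes the default scrapy layout)
ITEM_PIPELINES = {
    'spider_aizhan.pipelines.SpiderAizhanPipeline': 300,
}
```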
Use the pipeline to save items to the database automatically
- ENTITY_AUTO_SAVE: when enabled, each item is automatically persisted as its entity
- ENTITY_MAPPING: maps a spider name to its entity class
- get_id: returns the field of the entity class used to look up the matching entity
```python
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import jsw_nx as nx
from jsw_site_toolbox.scrapy.base_pipeline import BasePipeline
from spider_aizhan.models.ciku_entry import CikuEntry
from spider_aizhan.models.ciku_domain_page import CikuDomainPage
from spider_aizhan.models.ciku_domain_url import CikuDomainUrl
from spider_aizhan.models.ciku_keyword_url import CikuKeywordUrl
from spider_aizhan.models.ciku_keyword import CikuKeyword


class SpiderAizhanPipeline(BasePipeline):
    ENTITY_AUTO_SAVE = True
    ENTITY_MAPPING = {
        'ciku_entry': CikuEntry,
        'ciku_site': CikuEntry,
        'ciku_domain_page': CikuDomainPage,
        'ciku_domain_url': CikuDomainUrl,
        'ciku_keyword_url': CikuKeywordUrl,
        'ciku_keyword': CikuKeyword,
    }

    def get_id(self, item, spider):
        # pick the lookup field by spider name
        sp1 = ['ciku_entry', 'ciku_site']
        sp2 = ['ciku_domain_page', 'ciku_domain_url', 'ciku_keyword_url']
        if nx.includes(sp1, spider.name):
            return 'entry_id'
        if nx.includes(sp2, spider.name):
            return 'url'
        if spider.name == 'ciku_keyword':
            return 'keyword'
```
process_item: create an entity from the item
```python
# one of the process_item handlers on SpiderAizhanPipeline
def process_ciku_entry(self, item, spider):
    entity = CikuEntry.first_or_create(**item)
    entity.save()
    return item
```
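The model classes themselves are not shown in this note. `first_or_create` suggests an Orator/Eloquent-style ORM, so a model might look roughly like this (a hypothetical sketch; the ORM choice, table name, and fields are all assumptions):

```python
# spider_aizhan/models/ciku_entry.py -- hypothetical sketch, assuming the Orator ORM
from orator import Model


class CikuEntry(Model):
    __table__ = 'ciku_entries'                  # assumed table name
    __fillable__ = ['entry_id', 'name', 'url']  # entry_id matches get_id above; other fields assumed
```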
Debugging with main.py
```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
process = CrawlerProcess(settings)
process.crawl('ciku_entry', url="https://ciku.aizhan.com/c35/")
process.start()
```
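Keyword arguments passed to `process.crawl` become attributes on the spider instance (standard Scrapy behavior), so the spider can pick up the injected `url` like this:

```python
import scrapy


class CikuEntrySpider(scrapy.Spider):
    name = 'ciku_entry'

    def start_requests(self):
        # self.url was injected via process.crawl('ciku_entry', url=...)
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        self.logger.info('fetched %s', response.url)
```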
List of common commands
Feature | Command | Notes |
---|---|---|
New project | | |
New spider | | creates a spider named abc_keyword |
New model | | creates the CikuDomainUrl model and generates its migration |
New migration | | creates the create_ciku_keywords_table migration |
pm2 related | | process daemon / keep-alive |
yo: new model/spider | | home-grown scaffolding |
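For the first two rows, the standard Scrapy CLI invocations look like this (the project name is taken from this note; the spider's start domain is illustrative):

```bash
# create a new project named spider_aizhan
scrapy startproject spider_aizhan
# create a spider named abc_keyword (the domain argument is illustrative)
scrapy genspider abc_keyword aizhan.com
```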
PyCharm environment setup
Python requests vs. scrapy.Request
- https://stackoverflow.com/questions/56230826/how-to-use-python-requests-with-scrapy
```python
import requests
from scrapy.http import HtmlResponse

# fetch with requests, then wrap the body so Scrapy selectors work on it
resp = requests.get(url)
response = HtmlResponse(url=url, body=resp.text, encoding='utf-8')
```
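The wrapped response then supports the usual Scrapy selector API:

```python
# the wrapped HtmlResponse behaves like a normal Scrapy response
title = response.css('title::text').get()
```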
Sending a POST request with Scrapy
```python
import json
from scrapy import Request

# inside a spider method; `data` is the JSON payload to send
yield Request(url, method="POST", body=json.dumps(data),
              headers={'Content-Type': 'application/json'}, callback=self.parse_json)
```
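A matching callback might look like this (`parse_json` is not defined in the original note, so this is a hypothetical sketch):

```python
import json


# hypothetical callback for the POST request above
def parse_json(self, response):
    data = json.loads(response.text)
    self.logger.info('POST response: %s', data)
```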