python学习：python并发编程 11/12 异步io优化并发爬虫

python并发编程教程

异步io

异步io实现的爬虫

import aiohttp
import asyncio
import time

urls = [
  f'https://www.cnblogs.com/#p{page}'
  for page in range(1, 10 + 1)
]


async def async_crawl(url):
  async with aiohttp.ClientSession() as session:
    async with session.get(url) as response:
      res =  await response.text()
      print(f'Crawl url and len of res: {url}, {len(res)}')


loop = asyncio.get_event_loop()

tasks = [
  loop.create_task(async_crawl(url))
  for url in urls
]

# amain with loop
start = time.time()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print(f'Async total time: {end - start}')

信号量

控制并发度，总量控制在一定的范围内。

加入信号量，控制异步并发的代码如下

import aiohttp
import asyncio
import time

semaphore = asyncio.Semaphore(3)

urls = [
  f'https://www.cnblogs.com/#p{page}'
  for page in range(1, 10 + 1)
]


async def async_crawl(url):
  async with semaphore:
    async with aiohttp.ClientSession() as session:
      async with session.get(url) as response:
        res =  await response.text()
        print(f'Crawl url and len of res: {url}, {len(res)}')


loop = asyncio.get_event_loop()

tasks = [
  loop.create_task(async_crawl(url))
  for url in urls
]

# amain with loop
start = time.time()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print(f'Async total time: {end - start}')

参考

https://www.bilibili.com/video/BV1bK411A7tV/?p=11