Some MJJ said Python's crawling speed is low; I don't quite agree.
Below is a quick test of Python's async crawling speed, and it's really not bad.
Environment: win7 + python3.6
Libraries used: aiohttp, asyncio, csv, bs4
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import csv
import time

async def write_csv():
    # Build the list pages for all four categories in one go
    urllist = ['https://www.aitaotu.com/guonei/list_{}.html'.format(x) for x in range(605)] \
        + ['https://www.aitaotu.com/rihan/list_{}.html'.format(x) for x in range(202)] \
        + ['https://www.aitaotu.com/gangtai/list_{}.html'.format(x) for x in range(33)] \
        + ['https://www.aitaotu.com/meinv/list_{}.html'.format(x) for x in range(89)]
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
        'Host': 'www.aitaotu.com',
        'Referer': 'https://www.aitaotu.com/guonei/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    }
    with open('aitaotu.csv', 'a+', newline='', encoding='utf-8') as f:
        f_csv = csv.writer(f)
        # Reuse a single session for every request instead of opening one per URL
        async with aiohttp.ClientSession() as session:
            for url in urllist:
                async with session.get(url, verify_ssl=False, headers=headers) as resp:
                    print(resp.status)
                    if resp.status == 200:
                        respdata = await resp.text()
                        page = BeautifulSoup(respdata, 'lxml')
                        # CSS selectors for the album link, cover image and title on each list page
                        hrefs = page.select('#infinite_scroll > div > div.item_t > div > a')
                        img_urls = page.select('#infinite_scroll > div > div.item_t > div > a > img')
                        titles = page.select('#infinite_scroll > div > div.item_b.clearfix > div.title > span > a')
                        for href1, img_url1, title1 in zip(hrefs, img_urls, titles):
                            href = href1.get('href')
                            img_url = img_url1.get('data-original')  # lazy-loaded images keep the real URL here
                            title = title1.get_text()
                            data = {'link': 'https://www.aitaotu.com/' + href,
                                    'image': img_url,
                                    'title': title}
                            print(data)
                            f_csv.writerow(data.values())

start = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(write_csv())
print('Total time: %s s' % (time.time() - start))
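
One thing worth noting: the loop above still awaits each page one at a time, so it never has more than one request in flight. To actually exploit asyncio you schedule all the fetches at once with asyncio.gather. Below is a minimal sketch of that variant; the fetch_page/crawl names and the concurrency cap of 20 are my own choices, not part of the test above:

import aiohttp
import asyncio

async def fetch_page(session, sem, url, headers):
    # One request, gated by the semaphore; returns the HTML text or None
    async with sem:
        async with session.get(url, verify_ssl=False, headers=headers) as resp:
            if resp.status == 200:
                return await resp.text()
            return None

async def crawl(urllist, headers):
    sem = asyncio.Semaphore(20)  # assumed cap on in-flight requests, tune to taste
    async with aiohttp.ClientSession() as session:
        # Schedule every page at once; gather drives them concurrently
        tasks = [fetch_page(session, sem, url, headers) for url in urllist]
        return await asyncio.gather(*tasks)

# pages = asyncio.get_event_loop().run_until_complete(crawl(urllist, headers))
# then parse each page with BeautifulSoup and write the CSV as above

With the requests overlapped like this, total time is dominated by the slowest batch of responses rather than the sum of every round trip, which is where the real async speedup comes from.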