异步协程爬取福利姬

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: 协程4-实战爬取tuao8.com
Author : chenci
date: 2022/3/25
-------------------------------------------------
"""
import aiofiles
import requests
from lxml import etree
import asyncio
import aiohttp
from fake_useragent import UserAgent
import os
import time

def create_dir_not_exist(path):
    """Create directory ``path``, including missing parents, if it is absent.

    ``exist_ok=True`` replaces the former "check, then create" sequence so
    the call cannot fail when the directory appears between the check and
    the creation.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


def get_photos_url(url='https://www.tuao8.xyz/category-2_1.html', timeout=30):
    """Fetch a category listing page and return the gallery links on it.

    The request is sent synchronously with the module-level ``header`` dict,
    which must be defined before this function is called.

    Args:
        url: Listing page to fetch. Defaults to the category page this
            script was written for.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled server.

    Returns:
        The ``href`` values matched by the gallery-link XPath, in document
        order (an empty list when nothing matches).
    """
    resp = requests.get(url=url, headers=header, timeout=timeout)
    listing = etree.HTML(resp.text)
    return listing.xpath('//*[@id="container"]/main/article/div/a/@href')


async def get_photos(photo_list):
    """Download one gallery's landing page and pass its HTML on for parsing.

    Args:
        photo_list: URL of a single gallery; it is handed to ``session.get``
            as-is and forwarded to ``get_photos_title_page`` as the base URL.
    """
    # limit=10 caps simultaneous connections of THIS session only. Every call
    # builds its own session, so this is not a crawl-wide concurrency limit.
    conn = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=conn) as session:
        # session.get() is itself an async context manager; no extra await.
        async with session.get(url=photo_list, headers=header) as resp:
            page_text = await resp.text()
    # The body has been read in full, so the session is closed before the
    # per-gallery crawl starts rather than being held open for its duration.
    await get_photos_title_page(page_text, photo_list)


async def get_photos_title_page(text, url):
    """Extract a gallery's title and last page number, then crawl its pages.

    The title is the text of the ``<h1>`` under ``#container/main/article``;
    the page number is the link text of the second-to-last ``<li>`` inside
    ``#dm-fy``. A gallery for which either value is missing or the page
    number is not numeric is skipped with a message.

    Args:
        text: HTML source of the gallery page.
        url: URL the HTML came from; forwarded as the base URL for paging.
    """
    html = etree.HTML(text)
    titles = html.xpath('//*[@id="container"]/main/article/h1/text()')
    page_labels = html.xpath('//*[@id="dm-fy"]/li[last()-1]/a/text()')
    if not titles or not page_labels:
        # Indexing [0] on an empty result used to raise IndexError here.
        print(f'跳过{url}: 未找到标题或页码')
        return

    title = titles[0]
    try:
        max_page = int(page_labels[0])
    except ValueError:
        print(f'跳过{url}: 页码无效 {page_labels[0]!r}')
        return

    # download_image() writes to ./imgs/<title>/<n>.jpg, so that is the
    # directory that has to exist (previously ./imgs/tuzo_xc/<title> was
    # created and the image writes failed on the missing folder).
    create_dir_not_exist(f'./imgs/{title}')

    # A single coroutine needs no create_task()/wait() wrapper; awaiting it
    # directly also lets its exceptions reach the caller.
    await get_download_url(url=url, title=title, max_page=max_page)


async def get_download_url(url, title, max_page):
    """Visit each page of a gallery and download the first image on it.

    Page ``i`` is requested as ``{url}?page={i}``. The first ``<img src>``
    found under an element with class ``entry`` is scheduled for download.

    Args:
        url: Base URL of the gallery.
        title: Gallery title, forwarded to ``download_image``.
        max_page: Highest page number; pages ``1..max_page`` are visited.
            NOTE(review): the caller parses this from the pagination bar and
            names it the maximum page — confirm against the live markup.
    """
    tasks = []
    conn = aiohttp.TCPConnector(limit=10)
    # One session serves every page of the gallery instead of a new session
    # (and connection pool) per page.
    async with aiohttp.ClientSession(connector=conn) as session:
        # range() excludes its stop value, so +1 is needed to reach the
        # last page; the previous range(1, max_page) skipped it.
        for i in range(1, max_page + 1):
            page_url = f'{url}?page={i}'
            try:
                async with session.get(url=page_url, headers=header) as resp:
                    page_text = await resp.text()
            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                # One unreachable page must not abandon the rest.
                print(f'页面请求失败 {page_url}: {exc!r}')
                continue

            html = etree.HTML(page_text)
            image_urls = html.xpath('//*[@class="entry"]//img/@src')
            if not image_urls:
                print(f'未找到图片 {page_url}')
                continue
            tasks.append(
                asyncio.create_task(download_image(image_urls[0], title, i))
            )

    # gather() accepts zero awaitables, unlike asyncio.wait(), which raises
    # ValueError on an empty collection. return_exceptions=True keeps one
    # failed download from cancelling the wait for the others.
    if tasks:
        await asyncio.gather(*tasks, return_exceptions=True)


async def download_image(image_url, title, i):
    """Download one image and store it as ``./imgs/<title>/<i>.jpg``.

    Network, HTTP-status and file-system errors are reported and swallowed so
    that one bad image does not stop the others. Anything else, including
    task cancellation, propagates.

    Args:
        image_url: Address of the image to fetch.
        title: Gallery title; used as the destination folder name.
        i: Page number; used as the file name.
    """
    target_dir = f'./imgs/{title}'
    conn = aiohttp.TCPConnector(limit=30)
    try:
        async with aiohttp.ClientSession(connector=conn) as session:
            async with session.get(url=image_url, headers=header) as resp:
                print(image_url)
                # Do not save an error page under a .jpg name.
                resp.raise_for_status()
                payload = await resp.read()

        # The body is read before the file is opened, so a failed transfer
        # leaves no empty file behind. The folder is ensured here because
        # this function alone decides where the file goes.
        os.makedirs(target_dir, exist_ok=True)
        async with aiofiles.open(f'{target_dir}/{i}.jpg', 'wb') as f:
            print(f'正在下载{title}{i}张')
            await f.write(payload)
    except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
        print(f'下载失败 {image_url}: {exc!r}')
        return
    # Printed only after the file has actually been written.
    print('下载完成')


async def main():
    """Collect the gallery links and crawl all galleries concurrently.

    ``get_photos_url`` is a blocking call; it runs before any task exists,
    so nothing else is stalled by it.
    """
    href_url_list = get_photos_url()
    if not href_url_list:
        # asyncio.wait() raised ValueError on an empty task list.
        print('未找到任何图集')
        return

    # return_exceptions=True lets every gallery finish even if one fails,
    # and hands the failures back instead of dropping them silently.
    outcomes = await asyncio.gather(
        *(get_photos(photo_list=url) for url in href_url_list),
        return_exceptions=True,
    )
    for url, outcome in zip(href_url_list, outcomes):
        if isinstance(outcome, BaseException):
            print(f'图集抓取失败 {url}: {outcome!r}')


if __name__ == '__main__':
    # Script entry point: time the whole run, including header setup.
    start = time.time()

    # `header` is read as a module global by the request helpers, so it has
    # to be bound before the event loop starts.
    ua = UserAgent()
    header = {
        'Referer': 'https://www.tuao8.xyz/category-2_2.html',
        'user-agent': ua.random,
    }

    asyncio.run(main())

    end = time.time()
    elapsed_seconds = int(end - start)
    print('全部下载完成!耗时:', elapsed_seconds, '秒')