| """ ------------------------------------------------- File Name: 协程4-实战爬取tuao8.com Author : chenci date: 2022/3/25 ------------------------------------------------- """ import aiofiles import requests from lxml import etree import asyncio import aiohttp from fake_useragent import UserAgent import os import time
def create_dir_not_exist(path):
    """Ensure the directory *path* exists, creating parent dirs as needed.

    Does nothing if the path already exists (file or directory).
    """
    if os.path.exists(path):
        return
    os.makedirs(path)
def get_photos_url():
    """Fetch the category listing page and return the album detail-page URLs.

    Uses the module-level ``header`` dict for the request headers.
    """
    response = requests.get(url='https://www.tuao8.xyz/category-2_1.html', headers=header)
    tree = etree.HTML(response.text)
    return tree.xpath('//*[@id="container"]/main/article/div/a/@href')
async def get_photos(photo_list):
    """Fetch one album's detail page and hand it off for parsing/downloading.

    ``photo_list`` is a single album URL (despite the name).
    """
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url=photo_list, headers=header) as resp:
            page_text = await resp.text()
            await get_photos_title_page(page_text, photo_list)
async def get_photos_title_page(text, url):
    """Parse an album detail page, create its output directory, and download all pages.

    Args:
        text: HTML of the album's first page.
        url: the album's base URL (pagination is done via ``?page=N``).
    """
    html = etree.HTML(text)
    title = html.xpath('//*[@id="container"]/main/article/h1/text()')[0]
    # The second-to-last pager <li> holds the highest page number.
    max_page = int(html.xpath('//*[@id="dm-fy"]/li[last()-1]/a/text()')[0])
    create_dir_not_exist(f'./imgs/tuzo_xc/{title}')
    # Fix: the original wrapped this single coroutine in create_task +
    # asyncio.wait for no benefit — a direct await is equivalent and simpler.
    await get_download_url(url=url, title=title, max_page=max_page)
async def get_download_url(url, title, max_page):
    """Walk every page of an album and schedule each page's image for download.

    Args:
        url: album base URL.
        title: album title (used as the output directory name).
        max_page: highest page number, inclusive.

    Fixes vs. original:
    - ``range(1, max_page)`` skipped the final page; now ``max_page + 1``.
    - A new TCPConnector/ClientSession was built on every iteration; one
      session is now reused for all page fetches.
    - ``asyncio.wait([])`` raises ValueError; guarded for the empty case.
    """
    tasks = []
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector) as session:
        for page in range(1, max_page + 1):
            page_url = f'{url}?page={page}'
            async with session.get(url=page_url, headers=header) as resp:
                page_text = await resp.text()
            html = etree.HTML(page_text)
            image_url = html.xpath('//*[@class="entry"]//img/@src')[0]
            # Downloads run concurrently while we keep paging through the album.
            tasks.append(asyncio.create_task(download_image(image_url, title, page)))
    if tasks:
        await asyncio.wait(tasks)
async def download_image(image_url, title, i):
    """Download a single image to ``./imgs/tuzo_xc/<title>/<i>.jpg``.

    Args:
        image_url: direct URL of the image.
        title: album title (directory name).
        i: 1-based page index, used as the file name.
    """
    connector = aiohttp.TCPConnector(limit=30)
    async with aiohttp.ClientSession(connector=connector) as session:
        try:
            async with session.get(url=image_url, headers=header) as resp:
                print(image_url)
                # Bug fix: the original wrote to ./imgs/{title}/ but the
                # directory created in get_photos_title_page is
                # ./imgs/tuzo_xc/{title}/ — every write raised
                # FileNotFoundError, silently eaten by a bare except.
                async with aiofiles.open(f'./imgs/tuzo_xc/{title}/{i}.jpg', 'wb') as f:
                    print(f'正在下载{title} 第{i}张')
                    await f.write(await resp.read())
        except (aiohttp.ClientError, OSError) as exc:
            # Narrowed from a bare ``except: pass`` that hid the path bug;
            # keep best-effort semantics but report the failure.
            print(f'下载失败 {image_url}: {exc}')
    print('下载完成')
async def main():
    """Entry coroutine: list all album URLs, then crawl them concurrently.

    Fix vs. original: ``asyncio.wait`` raises ValueError on an empty task
    set, so a listing page with no links crashed the script; now we return
    early instead.
    """
    href_url_list = get_photos_url()
    if not href_url_list:
        return
    tasks = [asyncio.create_task(get_photos(photo_list=url)) for url in href_url_list]
    await asyncio.wait(tasks)
if __name__ == '__main__':
    # Script entry point: build the shared request header (module-global,
    # read by every fetch function), run the crawler, and report wall time.
    started_at = time.time()
    header = {
        'Referer': 'https://www.tuao8.xyz/category-2_2.html',
        'user-agent': UserAgent().random,
    }
    asyncio.run(main())
    elapsed = int(time.time() - started_at)
    print('全部下载完成!耗时:', elapsed, '秒')
|