# 异步协程爬取福利姬

# 发表于 2022-04-14 | 更新于 2022-05-15 | 分类于 爬虫 | 阅读次数：
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name：     协程4-实战爬取tuao8.com
   Author :       chenci
   date：          2022/3/25
-------------------------------------------------
"""
import aiofiles
import requests
from lxml import etree
import asyncio
import aiohttp
from fake_useragent import UserAgent
import os
import time

def create_dir_not_exist(path):
    """Create the directory ``path``, including missing parents, if absent.

    ``exist_ok=True`` makes the call a no-op when the directory already
    exists and removes the window between a separate existence check and
    the creation, during which another thread or process could create it.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


# 抓取每个条目的图集
def get_photos_url():
    """Fetch the category listing page and return the gallery links on it.

    The request headers come from the module-level ``header`` dict.

    Returns:
        list: The ``href`` values matched by the article-link XPath, or an
        empty list when the response body yields no parsed document.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    resp = requests.get(
        url='https://www.tuao8.xyz/category-2_1.html',
        headers=header,
        timeout=30,
    )
    tree = etree.HTML(resp.text)
    # NOTE(review): etree.HTML appears to return None for a body without any
    # parseable markup; guard so the caller gets an empty list instead of an
    # AttributeError.
    if tree is None:
        return []
    return tree.xpath('//*[@id="container"]/main/article/div/a/@href')


# 去请求每个图集.返回源码
async def get_photos(photo_list):
    """Download one gallery page and pass its HTML to ``get_photos_title_page``.

    The request headers come from the module-level ``header`` dict.

    Args:
        photo_list: URL of a single gallery page. Despite the name this is
            one URL string; ``main`` calls this coroutine once per URL.
    """
    # limit=10 caps simultaneous connections for this session only; the
    # aiohttp default is 100 and 0 means unlimited.
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector) as session:
        # session.get() is already an async context manager, so it is
        # entered directly rather than awaited first.
        async with session.get(url=photo_list, headers=header) as resp:
            page_text = await resp.text()
            await get_photos_title_page(page_text, photo_list)


# 从每个源码里筛选出标题和最大页码,url
async def get_photos_title_page(text, url):
    """Extract a gallery's title and page count, then download its images.

    Creates ``./imgs/tuzo_xc/<title>`` and awaits ``get_download_url`` for
    the gallery. A page without a title or a numeric page count is skipped
    with a printed message instead of raising ``IndexError``/``ValueError``.

    Args:
        text: HTML source of the gallery page.
        url: URL of that gallery page; forwarded to ``get_download_url``.
    """
    html = etree.HTML(text)
    if html is None:
        titles, page_numbers = [], []
    else:
        titles = html.xpath('//*[@id="container"]/main/article/h1/text()')
        # The page count is read from the second-to-last pagination item.
        # NOTE(review): presumably the last item is a "next" link - confirm.
        page_numbers = html.xpath('//*[@id="dm-fy"]/li[last()-1]/a/text()')

    if not titles or not page_numbers:
        print(f'跳过 {url}: 未找到标题或页码')
        return

    try:
        max_page = int(page_numbers[0])
    except ValueError:
        print(f'跳过 {url}: 页码无法解析 {page_numbers[0]!r}')
        return

    title = titles[0]
    create_dir_not_exist(f'./imgs/tuzo_xc/{title}')
    # Awaiting the coroutine directly (rather than wrapping a single task in
    # asyncio.wait) lets any failure propagate to the caller instead of
    # being left unretrieved on the task.
    await get_download_url(url=url, title=title, max_page=max_page)


# 获取每一页的url并从源码中筛选出每张图片的下载链接
async def get_download_url(url, title, max_page):
    """Visit every page of a gallery and download the first image on each.

    Pages are requested as ``<url>?page=<i>`` for ``i`` from 1 through
    ``max_page`` inclusive. Each image found is handed to ``download_image``
    as a separate task, and all tasks are awaited before returning. The
    request headers come from the module-level ``header`` dict.

    Args:
        url: Base URL of the gallery.
        title: Gallery title, forwarded to ``download_image``.
        max_page: Highest page number of the gallery (inclusive).
    """
    tasks = []
    connector = aiohttp.TCPConnector(limit=10)
    # One session serves all page requests instead of a new one per page.
    async with aiohttp.ClientSession(connector=connector) as session:
        # max_page is the last page number, so the range must include it.
        for i in range(1, max_page + 1):
            page_url = f'{url}?page={i}'
            try:
                async with session.get(url=page_url, headers=header) as resp:
                    page_text = await resp.text()
            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                # One bad page should not abort the rest of the gallery.
                print(f'第{i}页请求失败 {page_url}: {exc!r}')
                continue

            html = etree.HTML(page_text)
            image_urls = (
                html.xpath('//*[@class="entry"]//img/@src')
                if html is not None else []
            )
            if not image_urls:
                print(f'第{i}页未找到图片: {page_url}')
                continue

            tasks.append(
                asyncio.create_task(download_image(image_urls[0], title, i))
            )

    # asyncio.wait raises ValueError when given an empty collection.
    if tasks:
        await asyncio.wait(tasks)


# 下载
async def download_image(image_url, title, i):
    """Download one image and save it as ``./imgs/tuzo_xc/<title>/<i>.jpg``.

    The target directory is the one ``get_photos_title_page`` creates; it is
    also created here if missing so the write cannot fail on an absent
    directory. Network, HTTP-status and file-system errors are reported and
    the image is skipped; they are not raised. The request headers come from
    the module-level ``header`` dict.

    Args:
        image_url: URL of the image to fetch.
        title: Gallery title, used as the sub-directory name.
        i: Page index, used as the file name.
    """
    target_dir = f'./imgs/tuzo_xc/{title}'
    connector = aiohttp.TCPConnector(limit=30)
    async with aiohttp.ClientSession(connector=connector) as session:
        try:
            async with session.get(url=image_url, headers=header) as resp:
                print(image_url)
                # Do not store an HTTP error page under a .jpg name.
                resp.raise_for_status()
                data = await resp.read()
            os.makedirs(target_dir, exist_ok=True)
            async with aiofiles.open(f'{target_dir}/{i}.jpg', 'wb') as f:
                print(f'正在下载{title}  第{i}张')
                await f.write(data)
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
            # Best-effort download: report the failure and carry on.
            print(f'下载失败 {image_url}: {exc!r}')
            return
        print('下载完成')


async def main():
    """Collect the gallery links and process every gallery concurrently.

    Calls ``get_photos_url`` for the list of gallery URLs and runs one
    ``get_photos`` coroutine per URL. A gallery that raises is reported and
    does not stop the others.
    """
    href_url_list = get_photos_url()
    # Nothing to schedule; asyncio.wait would raise ValueError on an empty set.
    if not href_url_list:
        print('未找到任何图集')
        return

    # return_exceptions=True collects failures so they can be reported here
    # instead of being left unretrieved on the tasks.
    results = await asyncio.gather(
        *(get_photos(photo_list=url) for url in href_url_list),
        return_exceptions=True,
    )
    for url, result in zip(href_url_list, results):
        if isinstance(result, BaseException):
            print(f'图集处理失败 {url}: {result!r}')


if __name__ == '__main__':
    # Script entry point: time the whole crawl and print the elapsed seconds.
    began = time.time()
    # `header` is read as a module-level global by every fetch function, so
    # it must be defined before main() runs. A random User-Agent is chosen
    # once per run.
    header = {
        'Referer': 'https://www.tuao8.xyz/category-2_2.html',
        'user-agent': UserAgent().random,
    }
    asyncio.run(main())
    elapsed_seconds = int(time.time() - began)
    print('全部下载完成!耗时:', elapsed_seconds, '秒')