import requests
from lxml import etree
import os

# Local root folder where every downloaded gallery is stored
customize_path = 'D:/vmgirls'

if not os.path.exists(customize_path):
    os.mkdir(customize_path)


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
}
url = 'https://www.vmgirls.com/sitemap.html'


'''
Crawl the sitemap for the link to every gallery's detail page
'''
response = requests.get(url=url, headers=headers)
site_html = response.text

tree = etree.HTML(site_html)
li_list = tree.xpath('//*[@id="content"]/ul/li')

for li in li_list:
    '''
    Parse out each gallery's detail-page link and title, and create a matching local folder to hold it
    '''
    title = li.xpath('a/@title')[0]
    href = li.xpath('a/@href')[0]

    title_folder = customize_path + '/' + title
    if not os.path.exists(title_folder):
        os.mkdir(title_folder)

    res = requests.get(url=href, headers=headers).text

    detail_tree = etree.HTML(res)
    a_list = detail_tree.xpath('/html/body/main/div/div[2]/div/div/div/div[4]/div[3]/a')
    '''
    Parse out the concrete URL of every image on the detail page
    '''
    for a in a_list:
        img_url = a.xpath('@href')[0]
        img_path = title_folder + '/' + img_url.split('/')[-1]
        # Check before downloading, so images already on disk are not fetched again
        if os.path.exists(img_path):
            print(f"{img_path} already exists")
            continue
        img_data = requests.get(url=img_url, headers=headers).content
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
            print(f"{img_url} downloaded")
    print(f"Finished downloading the '{title}' gallery!")

The result looks like this:
(screenshot of the downloaded result)
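
As a side note: the script above still stops the whole run if a single image request fails. A minimal, optional hardening sketch (not part of the original script; the helper name download_image, the 10-second timeout, and the 1-second delay are my own choices) is to add a timeout, catch per-image errors, and pause briefly between downloads. It reuses the os and requests imports from the script above:

import time

def download_image(img_url, img_path, headers, delay=1.0):
    # Hypothetical helper: skip files that already exist locally
    if os.path.exists(img_path):
        print(f"{img_path} already exists")
        return
    try:
        # The timeout keeps one dead link from hanging the whole run
        img_data = requests.get(url=img_url, headers=headers, timeout=10).content
    except requests.RequestException as e:
        print(f"{img_url} failed: {e}")
        return
    with open(img_path, 'wb') as fp:
        fp.write(img_data)
    print(f"{img_url} downloaded")
    time.sleep(delay)  # be polite to the server between downloads

The inner for-a-in-a_list loop could then simply call download_image(img_url, img_path, headers).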

Q.E.D.

