import requests
from lxml import etree
import os
# Root directory under which one sub-folder per series is created.
customize_path = 'D:/vmgirls'
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
}
# Sitemap page that lists the detail page of every series.
url = 'https://www.vmgirls.com/sitemap.html'
# Seconds to wait on a server; without an explicit timeout requests can
# block forever on a stalled connection.
request_timeout = 10

# Characters that Windows rejects in file and directory names, each mapped
# to an underscore.
_forbidden_name_chars = str.maketrans({ch: '_' for ch in '<>:"/\\|?*'})


def sanitize_folder_name(name):
    """Return *name* made safe for use as a single directory name.

    Characters that are invalid in Windows paths are replaced with ``_``,
    surrounding whitespace and trailing dots are removed, and an empty
    result falls back to ``'untitled'``.
    """
    cleaned = name.translate(_forbidden_name_chars).strip().rstrip('. ')
    return cleaned or 'untitled'


def fetch(session, target):
    """GET *target* through *session* and return the response.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx status (via ``raise_for_status``).
    """
    response = session.get(url=target, timeout=request_timeout)
    response.raise_for_status()
    return response


def download_series(session, title, href):
    """Download every image linked from the detail page *href*.

    Images are stored in ``customize_path/<sanitized title>``. Files that
    already exist are skipped without being downloaded again, and a failed
    image is reported and skipped rather than aborting the series.

    Raises:
        requests.RequestException: if the detail page itself cannot be
            fetched.
    """
    title_folder = os.path.join(customize_path, sanitize_folder_name(title))
    os.makedirs(title_folder, exist_ok=True)

    detail_tree = etree.HTML(fetch(session, href).text)
    if detail_tree is None:
        # Nothing parseable came back, so there are no links to follow.
        print(f"{href} 爬取失败: 页面为空")
        return

    # Absolute, position-based XPath kept from the original script;
    # presumably it matches the gallery links of the site layout at the time
    # of writing. TODO: confirm against the live page if nothing is found.
    a_list = detail_tree.xpath(
        '/html/body/main/div/div[2]/div/div/div/div[4]/div[3]/a'
    )

    failures = 0
    # Resolve the concrete URL of each image on the detail page.
    for a in a_list:
        hrefs = a.xpath('@href')
        if not hrefs:
            continue
        img_url = hrefs[0]
        file_name = img_url.split('/')[-1]
        if not file_name:
            # A URL ending in '/' yields no usable file name.
            continue
        img_path = os.path.join(title_folder, file_name)
        # Check before downloading so a re-run does not fetch the image again.
        if os.path.exists(img_path):
            print(f"{img_url} 已存在")
            continue
        try:
            content = fetch(session, img_url).content
        except requests.RequestException as exc:
            failures += 1
            print(f"{img_url} 爬取失败: {exc}")
            continue
        with open(img_path, 'wb') as fp:
            fp.write(content)
        print(f"{img_url} 爬取成功")

    if failures:
        print(f"{title}系列有{failures}张图片爬取失败")
    else:
        print(f"{title}系列爬取成功!")


def main():
    """Crawl the sitemap and download the images of every listed series."""
    os.makedirs(customize_path, exist_ok=True)
    # One session reuses connections and carries the headers on every request.
    with requests.Session() as session:
        session.headers.update(headers)
        # Collect the detail-page link of every series from the sitemap.
        tree = etree.HTML(fetch(session, url).text)
        if tree is None:
            print(f"{url} 爬取失败: 页面为空")
            return
        for li in tree.xpath('//*[@id="content"]/ul/li'):
            # Each entry carries the series name (title) and its detail page
            # (href); entries missing either one are skipped.
            titles = li.xpath('a/@title')
            hrefs = li.xpath('a/@href')
            if not titles or not hrefs:
                continue
            try:
                download_series(session, titles[0], hrefs[0])
            except requests.RequestException as exc:
                # A broken detail page must not stop the remaining series.
                print(f"{hrefs[0]} 爬取失败: {exc}")


if __name__ == '__main__':
    main()
# The result is shown below.
# Q.E.D.