Python 优衣库图片爬虫

lzs482889275   ·   发表于 2022-06-03 11:37:19   ·   闲聊灌水区

import time
from time import sleep
import pandas as pd
import requests
from lxml import etree

def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned. Defaults to 10.

    Returns:
        The response body decoded with the encoding detected from its
        content.
    """
    # Send a desktop-browser User-Agent instead of the default requests one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    # Without an explicit timeout a stalled server would block the crawl
    # indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Use the encoding detected from the body so the text is decoded with
    # what the page actually contains rather than the header-derived guess.
    response.encoding = response.apparent_encoding
    return response.text

def parse_html(html):
    """Extract the image URLs listed on a page.

    Looks for every ``<ul>`` directly under a ``<div class="TypeList">`` and
    collects the ``src`` attribute of each ``<img>`` nested inside an ``<a>``.

    Args:
        html: Page markup as a string.

    Returns:
        A list of ``src`` values from all matching lists, in document order
        per list; an empty list when nothing matches or *html* is empty.
    """
    # Nothing to parse; skip handing an empty document to lxml.
    if not html:
        return []

    tree = etree.HTML(html)
    # NOTE(review): lxml may yield None for markup it cannot build a tree
    # from -- treat that the same as "no images found".
    if tree is None:
        return []

    image_urls = []
    # Accumulate across every matching list instead of returning after the
    # first one, so pages with several lists are fully covered.
    for image_list in tree.xpath("//div[@class='TypeList']/ul"):
        image_urls.extend(image_list.xpath('.//a/img/@src'))
    return image_urls

# Walk 15 listing pages and print the image URLs found on each one.
for i in range(15):
    # The first page has no numeric suffix; the others are index_<i>.htm.
    if i == 0:
        url = 'https://www.umeitu.com/bizhitupian/huyanbizhi/index.htm'
    else:
        url = 'https://www.umeitu.com/bizhitupian/huyanbizhi/index_{}.htm'.format(i)

    # Pause one second before each request to avoid hammering the site.
    sleep(1)
    html = get_html(url)
    print(parse_html(html))

打赏我,让我更有动力~

0 Reply   |  Until 4个月前 | 408 View
Log in to publish content
返回顶部 投诉反馈

© 2016 - 2022 掌控者 All Rights Reserved.