python多线程爬取图片_Python爬虫实战,python多线程抓取头像图片源码附exe程序及资源包...

Python爬虫实战,python多线程抓取头像图片源码附exe程序及资源包

python多线程抓取头像图片源码附exe程序及资源包!

1.使用到的库:requests、lxml(etree)、re、os、multiprocessing.dummy(其中的 Pool 即线程池 ThreadPool)

2.网页编码为utf-8,需要手动指定响应编码:html.encoding="utf-8"

3.使用xpath获取图片链接

4.使用了多线程

5.运行时需要输入要采集的页数n(从第1页采集到第n页),具体可以看动态图片

6.头像首页为栏目页,网址中没有页码(index.html,而不是index_1.html),这里用了if判断单独处理

7.py打包exe命令:pyinstaller -F 目录文件.py

#by 微信:huguo00289

# -*- coding: utf-8 -*-

import requests

from lxml import etree

import re

import os

from multiprocessing.dummy import Pool as ThreadPool

def hqlj(n, *, timeout=10):
    """Collect the detail-page URLs listed on the first ``n`` avatar index pages.

    Args:
        n: Number of index pages to crawl, starting at page 1. A value
            below 1 results in no requests and an empty list.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the crawl forever.

    Returns:
        A list of absolute detail-page URLs, in the order they were found.

    Raises:
        Exceptions raised by ``requests.get`` (for example on a timeout)
        are not caught here and propagate to the caller.
    """
    base = 'https://www.woyaogexing.com'
    urls = []
    for x in range(1, n + 1):
        # The first index page has no page number in its file name.
        if x == 1:
            url = f'{base}/touxiang/index.html'
        else:
            url = f'{base}/touxiang/index_{x}.html'
        print(url)

        response = requests.get(url, timeout=timeout)
        # Decode the body as UTF-8 instead of relying on the guessed charset.
        response.encoding = "utf-8"
        con = etree.HTML(response.text)

        # The trailing space in "txList " is part of the class value matched.
        href = con.xpath('//div[@class="txList "]/a/@href')
        print(href)
        for lj in href:
            # The extracted hrefs are prefixed with the site root to make
            # them absolute.
            lj = f'{base}{lj}'
            print(lj)
            urls.append(lj)
    print(urls)
    return urls

def hqtx(url, *, timeout=10):
    """Download every lazy-loaded image of one avatar detail page.

    The images are written to ``./touxiang/<page title>/`` and named by
    their 1-based position on the page plus the extension taken from the
    image URL.

    Args:
        url: Absolute URL of the detail page, e.g.
            "https://www.woyaogexing.com/touxiang/qinglv/2019/800160.html".
        timeout: Seconds passed to ``requests.get`` as its timeout, for
            the page and for each image.

    Raises:
        Exceptions raised by ``requests.get`` or by the file system calls
        are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    # Decode the body as UTF-8 instead of relying on the guessed charset.
    response.encoding = "utf-8"
    con = etree.HTML(response.text)

    titles = con.xpath('//h1/text()')
    if titles:
        title = titles[0]
    else:
        # No <h1> on the page: fall back to the page's file name so that
        # one odd page does not abort the whole crawl with an IndexError.
        title = os.path.splitext(os.path.basename(url))[0]
    # Replace the characters that are not allowed in Windows file names and
    # drop leading/trailing spaces and dots.
    h1 = re.sub(r'[\\/:*?"<>|]', "_", title).strip(' .') or 'untitled'
    print(h1)
    os.makedirs(f'./touxiang/{h1}/', exist_ok=True)

    imgs = con.xpath('//img[@class="lazy"]/@src')
    print(imgs)
    for i, img in enumerate(imgs, start=1):
        # A "src" without a scheme gets "https:" prepended; one that is
        # already absolute is used unchanged.
        if img.startswith(('http://', 'https://')):
            img_url = img
        else:
            img_url = f'https:{img}'
        # Take the extension from the URL path, ignoring any query string.
        ext = os.path.splitext(img_url.split('?', 1)[0])[1]
        img_name = f'{i}{ext}'
        print(img_name)
        print(img_url)

        r = requests.get(img_url, timeout=timeout)
        if not r.ok:
            # Do not save an HTTP error page as if it were an image.
            print(f"下载{img_url}失败(HTTP {r.status_code}),已跳过")
            continue
        # 'wb' overwrites: appending on a re-run would corrupt the file.
        with open(f'./touxiang/{h1}/{img_name}', 'wb') as f:
            f.write(r.content)
        print(f"保存{img_name}图片成功!")

#hqlj("https://www.woyaogexing.com/touxiang/")

if __name__ == '__main__':
    # Script entry point: ask how many index pages to crawl, collect the
    # detail-page URLs, then download the avatars of each page in parallel.
    try:
        n = int(input("请输入要采集的页码数:"))
    except ValueError:
        # Exit with a clear message instead of a traceback on bad input.
        raise SystemExit("页码数必须是整数!") from None

    urls = hqlj(n)

    try:
        # No size argument is given, so the pool uses its default number of
        # worker threads (the CPU count). map() blocks until every page has
        # been processed; the with-block then releases the pool even when a
        # worker raised.
        with ThreadPool() as pool:
            pool.map(hqtx, urls)
        print("采集所有头像完成!")
    except Exception as exc:
        # Report the real failure; the pool starting is rarely the cause.
        print(f"Error: avatar download failed: {exc!r}")

最后附上exe打包程序,需要的可以试试!

爬取了500页数据,分享给大家吧!总共1.71g!

看了下有部分数据编码好像有问题,大家凑合着用吧,不想改了!

你可能感兴趣的:(python多线程爬取图片)