Code:
'''
# Get the current working directory of the running Python program; if you need to add a directory to the package import search path, set it here (a sketch follows after this block)
import os
print(os.getcwd())
'''
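If you do need to extend the import search path, a minimal sketch using sys.path (the directory below is only a placeholder):
'''
import sys
# Append a directory so that modules inside it become importable
sys.path.append("/path/to/my/packages")
print(sys.path)
'''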
'''
# Get the installation path of the Python interpreter running this program
import sys
print(sys.executable)
'''
# http://www.521609.com/uploads/allimg/111114/1102411R51-1-lp.jpg
import requests, re, os, csv
from bs4 import BeautifulSoup
# Fake the request headers to look like a browser, otherwise the request will fail
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
a = requests.get("http://www.521609.com/meinvxiaohua/",headers=header)
# Check what encoding the page uses, then set the response object's encoding attribute to that charset, otherwise the text will be garbled; here it is gb2312
a.encoding = 'gb2312'
b = a.text
print(b,type(b))
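# Side note (a sketch, not part of the original flow): if you do not know the page encoding in advance,
# requests can guess it from the response body via the apparent_encoding attribute, e.g.:
# print(a.apparent_encoding)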
soup = BeautifulSoup(b,"lxml")
print(type(soup))
# Compile the regular expression into an object that can be reused for matching; without a capture group the whole match is returned, with parentheses only the text inside the group is returned
q = re.compile(r'src="(.*?)" width')  # non-greedy, so the match stops at the first " width
# findall collects every captured string into one list
e = q.findall(b)
print(e)
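# A tiny illustration (assumed, not from the original) of the group behaviour described above:
# re.findall(r'src="(.*?)"', '<img src="/img/a.jpg" width="160">') returns ['/img/a.jpg'],
# whereas re.findall(r'src=".*?"', '<img src="/img/a.jpg" width="160">') returns ['src="/img/a.jpg"'].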
# Write the csv header row here
head = ["序号", "资源地址"]
with open("weihangcsv.csv", mode="w", encoding="utf-8", newline="") as file:
w = csv.writer(file)
w.writerow(head)
num = 1
r = []
# Write the csv body here; mode "a" appends to the existing file, continuing below the rows already written; writerow writes one row at a time
with open("weihangcsv.csv", mode="a", encoding="utf-8", newline="") as file:
    w = csv.writer(file)
    for i in e:
        r.append('http://www.521609.com' + i)
        list1 = [num, 'http://www.521609.com' + i]
        num += 1
        w.writerow(list1)
# os.path.join() seems to only join filesystem paths, not URLs
# r.append(os.path.join('http://www.521609.com', i))
# print(os.path.join('http://www.521609.com', i))
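# For URLs the standard-library urllib.parse.urljoin does the joining correctly; a minimal sketch (illustration only, the list above is already built by plain concatenation):
from urllib.parse import urljoin
print(urljoin('http://www.521609.com/', '/uploads/allimg/111114/1102411R51-1-lp.jpg'))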
print(r)
# c = soup.find_all("a")
# print(c)
# for d in c:
#     print(d.attrs)
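# The commented-out find_all experiment above can also be finished with BeautifulSoup instead of the regex;
# a minimal sketch (assuming the soup object built earlier) that collects every <img> src attribute:
imgs = [img.get("src") for img in soup.find_all("img") if img.get("src")]
print(imgs)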
Write a dictionary object to a target file (the file contents are a JSON string; JSON is just text)
import json
# dic is a dictionary object
with open("doubanjson.json", mode="a", encoding="utf-8") as file:
    json.dump(dic, file, ensure_ascii=False)
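Reading the file back is the mirror operation; a minimal sketch (assuming the file holds a single JSON object; note that repeatedly appending with mode "a" produces several concatenated objects, which json.load cannot parse as one document):
import json
with open("doubanjson.json", mode="r", encoding="utf-8") as file:
    data = json.load(file)
print(data)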
Write the scraped content to a txt file
# name ... name5 are fields produced by the scraping step
with open("doubanre.txt", mode="a", encoding="utf-8") as file:
    file.write("{0}\t{1}\t{2}\t{3}\t图书网址:{4}\t图片网址:{5}\n".format(name, name1, name2, name3, name4, name5))
Write the data to a database
import pymysql
# Connect to the database
con = pymysql.connect(host="localhost", port=3306, user="root", password="root", db="doubanread")
# Get a database cursor
cur = con.cursor()
# Insert a row; commit on success, roll back on failure
try:
    cur.execute("insert into bookbox(bookname,writename,writepath,writedate) values(%s,%s,%s,%s)", (name1, name2, name3, name4))
    con.commit()
except Exception:
    con.rollback()
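Closing the cursor and the connection afterwards is good practice; a minimal sketch continuing from the objects above:
cur.close()
con.close()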
Full version that also downloads the images locally (verified on CentOS)
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
a = requests.get("http://www.521609.com/meinvxiaohua/",headers=header)
# Set encoding to the page's own charset after checking what it is, otherwise the text will be garbled
a.encoding = 'gb2312'
b = a.text
print(b,type(b))
soup = BeautifulSoup(b,"lxml")
print(type(soup))
q = re.compile(r'src="(.*?)" width')  # non-greedy, same pattern as above
e = q.findall(b)
print(e)
# Write the csv header here
head = ["序号", "资源地址"]
with open("weihangcsv.csv", mode="w", encoding="utf-8", newline="") as file:
    w = csv.writer(file)
    w.writerow(head)
num = 1
r = []
with open("weihangcsv.csv", mode="a", encoding="utf-8", newline="") as file:
for i in e:
r.append('http://www.521609.com' + i)
list1 = [num,'http://www.521609.com' + i]
# 这里是请求图片的url资源,然后返回响应,之后使用这 个二进制响应进行图片下载操作,也就是将二进制数据写入到新文件当中这个就是图片的下载。
imgfile = requests.get('http://www.521609.com' + i,header)
i = imgfile.content
num +=1
# 这里用上面使用图片url请求到的二进制内容来进行写入操作(也就是下载图片的操作,写完了最下面的两行是将之前得到的数据写入到csv文件当中去记录)
with open("imgbag/weihangimg{0}.jpg".format(num), mode="wb") as im:
im.write(i)
w = csv.writer(file)
w.writerow(list1)
# os.path.join() seems to only join filesystem paths, not URLs (use urllib.parse.urljoin for URLs, as sketched earlier)
# r.append(os.path.join('http://www.521609.com', i))
# print(os.path.join('http://www.521609.com', i))
print(r)
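For large images it can be worth streaming the download instead of loading the whole body into memory; a minimal sketch (assumed, reusing the header dict from above; the output file name is only an example) with requests' stream mode:
img_url = 'http://www.521609.com/uploads/allimg/111114/1102411R51-1-lp.jpg'
resp = requests.get(img_url, headers=header, stream=True)
with open("imgbag/stream_example.jpg", mode="wb") as im:
    for chunk in resp.iter_content(chunk_size=8192):
        im.write(chunk)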
Using a proxy IP
See a separate blog post on proxy IP usage
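A minimal sketch of routing a request through a proxy with requests (the proxy address below is a placeholder, not a working proxy; the header dict is the one defined above):
proxies = {
    "http": "http://127.0.0.1:8888",
    "https": "http://127.0.0.1:8888",
}
resp = requests.get("http://www.521609.com/meinvxiaohua/", headers=header, proxies=proxies)
print(resp.status_code)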