Python: extracting and analyzing data, writing it to a CSV file, and making a second request to each scraped image URL to download the images locally

Code:


'''
# Get the current working directory of the Python program; if you need to add
# package import search paths, this is where to set them (see the sketch below)
import os
print(os.getcwd())
'''
'''
# Get the installation path of the current Python interpreter
import sys
print(sys.executable)
'''
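
If the goal is to make modules in another directory importable, the search path lives in sys.path. A minimal sketch (the mylibs directory name is a made-up example):

import os
import sys

# Append a hypothetical local module directory to the import search path
sys.path.append(os.path.join(os.getcwd(), "mylibs"))
print(sys.path)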

# http://www.521609.com/uploads/allimg/111114/1102411R51-1-lp.jpg
import requests, re, os, csv
from bs4 import BeautifulSoup
# Fake the request headers to look like a browser, otherwise the request may fail
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
a = requests.get("http://www.521609.com/meinvxiaohua/",headers=header)
# Check what encoding the page actually uses, then set the response object's
# encoding attribute to match, otherwise the text will be garbled; here it is gb2312
a.encoding = 'gb2312'
b = a.text
print(b, type(b))
soup = BeautifulSoup(b,"lxml")
print(type(soup))
# Compile a regular expression into a pattern object for repeated use later.
# Without a capture group the whole match is returned; with parentheses only the
# captured group is returned. The non-greedy (.*?) stops at the first '" width'.
q = re.compile(r'src="(.*?)" width')
# findall returns every matching capture as one list
e = q.findall(b)
print(e)
# Write the CSV header row here
head = ["序号", "资源地址"]
with open("weihangcsv.csv", mode="w", encoding="utf-8", newline="") as file:
    w = csv.writer(file)
    w.writerow(head)
num = 1
r = []
# Write the CSV rows here. Mode "a" appends: writing continues after the file's
# existing content instead of overwriting it. writerow writes one row at a time.
with open("weihangcsv.csv", mode="a", encoding="utf-8", newline="") as file:
    w = csv.writer(file)
    for i in e:
        r.append('http://www.521609.com' + i)
        list1 = [num, 'http://www.521609.com' + i]
        num += 1
        w.writerow(list1)
    # os.path.join() only joins filesystem paths, not URLs; for URLs use
    # urllib.parse.urljoin instead (see the sketch below)
    # r.append(os.path.join('http://www.521609.com', i))
    # print(os.path.join('http://www.521609.com', i))
print(r)
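
As the comment above notes, os.path.join() is meant for filesystem paths, not URLs. The standard library's urllib.parse.urljoin handles URLs correctly; a minimal sketch using the sample image path from the top of the post:

from urllib.parse import urljoin

# urljoin resolves slashes and relative paths by URL rules, not filesystem rules
base = 'http://www.521609.com'
print(urljoin(base, '/uploads/allimg/111114/1102411R51-1-lp.jpg'))
# -> http://www.521609.com/uploads/allimg/111114/1102411R51-1-lp.jpg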

# c = soup.find_all("a")
# print(c)
# for d in c:
#     print(d.attrs)
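
The soup object built above is never actually used for extraction. As an alternative to the regex, a sketch of pulling the same addresses out with BeautifulSoup, assuming the targets are <img> tags with a src attribute:

# Collect the src attribute of every <img> tag instead of regex-matching raw HTML
img_urls = [img["src"] for img in soup.find_all("img") if img.has_attr("src")]
print(img_urls)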

Writing a dict object to a target file (the file then contains a JSON string; JSON is just text)

import json

# dic is a dictionary object; ensure_ascii=False keeps non-ASCII characters readable
with open("doubanjson.json", mode="a", encoding="utf-8") as file:
    json.dump(dic, file, ensure_ascii=False)
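
One caveat: appending several dumps into the same file does not yield one valid JSON document. A common workaround (a sketch, not part of the original code) is one JSON object per line, parsed back line by line:

import json

dic = {"name": "示例", "rating": 9.0}  # hypothetical record for illustration
with open("doubanjson.json", mode="a", encoding="utf-8") as file:
    file.write(json.dumps(dic, ensure_ascii=False) + "\n")

with open("doubanjson.json", encoding="utf-8") as file:
    records = [json.loads(line) for line in file if line.strip()]
print(records)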

Writing the scraped content to a txt file

with open("doubanre.txt",mode="a",encoding="utf-8") as file:
            file.write("{0}\t{1}\t{2}\t{3}\t图书网址:{4}\t图片网址:{5}\n".format(name,name1,name2,name3,name4,name5))

Writing the data to a database

import pymysql

# Connect to the database
con = pymysql.connect(host="localhost", port=3306, user="root", password="root", db="doubanread")
# Get a database cursor
cur = con.cursor()
# Insert a row; commit on success, roll back on failure
try:
    cur.execute("insert into bookbox(bookname,writename,writepath,writedate) values(%s,%s,%s,%s)", (name1, name2, name3, name4))
    con.commit()
except Exception:
    con.rollback()
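
For completeness, a self-contained sketch of the same insert that also closes the connection (table and column names come from the snippet above; the sample values are made up):

import pymysql

con = pymysql.connect(host="localhost", port=3306, user="root",
                      password="root", db="doubanread", charset="utf8mb4")
try:
    with con.cursor() as cur:
        # Parameterized query: pymysql escapes the values itself
        cur.execute(
            "insert into bookbox(bookname,writename,writepath,writedate) values(%s,%s,%s,%s)",
            ("示例书名", "示例作者", "/books/1", "2020-04-01"),  # hypothetical sample row
        )
    con.commit()
except Exception:
    con.rollback()
finally:
    con.close()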

Full version, including downloading the images locally (verified on CentOS)

import requests, re, os, csv
from bs4 import BeautifulSoup

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
a = requests.get("http://www.521609.com/meinvxiaohua/",headers=header)
# Check the page's actual encoding first, then set encoding to match, otherwise
# the text will be garbled
a.encoding = 'gb2312'
b = a.text
print(b, type(b))
soup = BeautifulSoup(b,"lxml")
print(type(soup))
q = re.compile(r'src="(.*?)" width')
e = q.findall(b)
print(e)
# Write the CSV header row here
head = ["序号", "资源地址"]
with open("weihangcsv.csv", mode="w", encoding="utf-8", newline="") as file:
    w = csv.writer(file)
    w.writerow(head)
num = 1
r = []
with open("weihangcsv.csv", mode="a", encoding="utf-8", newline="") as file:
    for i in e:
        r.append('http://www.521609.com' + i)
        list1 = [num,'http://www.521609.com' + i]
        # 这里是请求图片的url资源,然后返回响应,之后使用这 个二进制响应进行图片下载操作,也就是将二进制数据写入到新文件当中这个就是图片的下载。
        imgfile = requests.get('http://www.521609.com' + i,header)
        i = imgfile.content
        num +=1
       # 这里用上面使用图片url请求到的二进制内容来进行写入操作(也就是下载图片的操作,写完了最下面的两行是将之前得到的数据写入到csv文件当中去记录)
        with open("imgbag/weihangimg{0}.jpg".format(num), mode="wb") as im:
            im.write(i)
            w = csv.writer(file)
            w.writerow(list1)
    # os.path.join()好像是只能拼接文件地址,不能拼接网址
    # r.append(os.path.join('http://www.521609.com',i))
    # print(os.path.join('http://www.521609.com',i))
print(r)
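
For larger files, a streamed download avoids holding the whole image in memory at once. A sketch using requests' streaming mode (the URL is the sample from the top of the post):

import requests

url = "http://www.521609.com/uploads/allimg/111114/1102411R51-1-lp.jpg"
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# stream=True defers fetching the body; iter_content yields it in chunks
with requests.get(url, headers=header, stream=True) as resp:
    resp.raise_for_status()
    with open("sample.jpg", mode="wb") as im:
        for chunk in resp.iter_content(chunk_size=8192):
            im.write(chunk)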

Using a proxy IP

See the separate blog post on proxy IP usage.
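
A minimal sketch of routing a requests call through a proxy (the proxy address here is a placeholder, not a working endpoint):

import requests

header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
# Placeholder proxy address; substitute a real one
proxies = {
    "http": "http://127.0.0.1:8888",
    "https": "http://127.0.0.1:8888",
}
resp = requests.get("http://www.521609.com/meinvxiaohua/", headers=header, proxies=proxies, timeout=10)
print(resp.status_code)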
