Notes: Web Scraper

Code for scraping archaeological artifact information and images:
1. Scrape the image URLs and download the images
2. Scrape the related information and save it to a txt file

import os
import re

import requests
from bs4 import BeautifulSoup

# The text information is saved to test.txt.

def look_img(i):
    # Grab all <img> tags on the page; the 4th one (the counter n starts
    # at 1 and reaches 5) is the artifact photo on these pages.
    img_src = soup.findAll("img")
    n = 1
    url_img = []  # URL of the image to download
    for img in img_src:
        n = n + 1
        img = img.get('src')  # read the src attribute
        if n == 5:
            url_img = img
    print(url_img)
    # Save the image
    root = "C://Users//123//Desktop//images//"  # root directory for downloads
    path = root + str(i) + ".jpg"  # full path of the saved file
    try:
        if not os.path.exists(root):  # create the root directory if it is missing
            os.mkdir(root)
        if not os.path.exists(path):  # only download when the file does not exist yet
            r = requests.get(url_img)
            # 'wb' opens the file for binary writing: an existing file is
            # overwritten, a missing one is created; r.content is bytes.
            with open(path, 'wb') as f:
                f.write(r.content)
            print("file saved")
        else:
            print("file already exists")
    except:
        print("download failed")



def info(tex):
    # Extract name / period / category / collecting institution from the
    # description text; the fields are separated by newlines in the page source.
    name = re.search(r'原名:.*\n年代:', tex)
    name = name[0].replace('原名:', '').replace('\n年代:', '')
    # print(name)
    year = re.search(r'年代:.*\n类别:', tex)
    year = year[0].replace('年代:', '').replace('\n类别:', '')
    # print(year)
    category = re.search(r'类别:.*\n', tex)
    category = category[0].replace('类别:', '').replace('\n', '')
    # print(category)
    company = re.search(r'收藏单位:.*', tex)
    company = company[0].replace('收藏单位:', '').replace('\n', '')
    # print(company)
    return str(name) + '\t' + str(year) + '\t' + str(category) + '\t' + str(company)


file = open("test.txt", 'w', encoding='utf-8')

# The artifact pages come in segments of IDs, and each segment shares one
# timestamp suffix in its URL. The ID is always zero-padded to 7 digits,
# so zfill(7) replaces the hand-written padding branches of the original loops.
segments = [
    (1, 2, '1255258500'),
    (2, 7, '1246375684'),
    (7, 28, '1273152644'),
    (28, 53, '1260091908'),
    (53, 75, '1286869892'),
    (75, 93, '1277987076'),
    (93, 117, '1296383620'),
    (117, 139, '1325257732'),
    (139, 156, '1310084484'),
]
for start, stop, stamp in segments:
    for i in range(start, stop):
        url = ('http://www.cd3000y.com/html/movablerelics/A-01-'
               + str(i).zfill(7) + '-' + stamp + '.html')
        print(url)
        r = requests.get(url, timeout=30)
        r.encoding = r.apparent_encoding
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        title = soup.findAll('h1')[0].string  # the page title lives in <h1>
        desc = soup.findAll('div', {'id': 'Description'})  # description block
        tex = str(desc)
        information = info(tex)
        file.write(title + "\n" + "\t" + "\t" + information + url + "\n")
        look_img(i)
file.close()
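
One fragile spot in look_img is that it picks the photo by counting to the 4th <img> tag on the page. Below is a minimal alternative sketch of my own (not from the original code): it assumes the artifact photo is the first <img> inside the Description div, which I have not verified against the live site, and the helper name find_artifact_img is hypothetical.

def find_artifact_img(soup):
    # Assumption: the artifact photo is the first <img> inside the
    # div with id="Description" (not confirmed by the original post).
    desc = soup.find('div', {'id': 'Description'})
    if desc is None:
        return None
    img = desc.find('img')  # first image inside the description block
    return img.get('src') if img else None

If that assumption holds, the lookup keeps working even when banner or layout images are added or removed elsewhere on the page.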

Match the pages that meet the criteria and save their URLs to a txt file

import random
import re
import time

import requests
from bs4 import BeautifulSoup

url = 'http://www.cd3000y.com/html/movablerelics/'
r = requests.get(url)
r.encoding = r.apparent_encoding
html = r.text
soup = BeautifulSoup(html,"html.parser")
# Keywords to match in page titles: names of ancient Chinese vessel types.
aim_list = ['尊','樽','壶','爵','角','觥','觚','彝','卣','罍','瓿','杯','卮','缶','豆','斝','盉','觯','瓮','钵','方彝','斗','碗','区','皿','鉴','斛','舟','羽觞','注子','温酒器']

file = open("C:\\Users\\123\\Desktop\\process.txt", 'a', encoding='utf-8')
url_list = soup.findAll('a')
web_list = []
error_list = []
for i in url_list:
    url = i.get('href')
    url = "http://www.cd3000y.com" + url  # turn the relative href into an absolute URL
    web_list.append(url)

for i in range(len(web_list)):
    url = web_list[i]
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except:
        continue  # skip pages that fail to download
    html = r.text
    soup = BeautifulSoup(html,"html.parser")
    title = soup.findAll('h1')
    title = str(title)
    # if(title=='None'):
    #     error_list.append(url)
    #     continue
    for keyword in aim_list:
        result = re.search(keyword, title)
        if result is not None:
            print(result)
            file.write(url + '\n')  # record pages whose title names a vessel type
            print(url)
            break
    time.sleep(random.random() * 3)  # random pause to go easy on the server

file.close()
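
Since re.search runs once per keyword, the 31 patterns could also be collapsed into a single alternation. A small standalone sketch of the same matching step (my own variation; the keyword list is shortened for the demo, and the title string is made up):

import re

aim_list = ['尊', '樽', '壶', '爵']  # shortened demo list; the real one has 31 entries
# Safe to join directly because the keywords are plain CJK characters,
# not regex metacharacters.
aim_pattern = re.compile('|'.join(aim_list))  # one alternation instead of 31 searches

title = '[<h1>青铜爵</h1>]'  # example title string
match = aim_pattern.search(title)
if match is not None:
    print('matched vessel type:', match.group())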

Read the URLs from the txt file, extract the information,
save the text data to an Excel file, and download the images

import os
import re
import time

import requests
import xlwt
from bs4 import BeautifulSoup


def look_img(i, s):
    # Grab all <img> tags on the page; the 4th one (the counter n starts
    # at 1 and reaches 5) is the artifact photo on these pages.
    img_src = soup.findAll("img")
    n = 1
    url_img = []  # URL of the image to download
    for img in img_src:
        n = n + 1
        img = img.get('src')  # read the src attribute
        if n == 5:
            url_img = img
    print(url_img)
    # Save the image
    root = "C://Users//123//Desktop//images//"  # root directory for downloads
    path = root + str(i) + s + ".jpg"  # file name: index plus page title
    try:
        if not os.path.exists(root):  # create the root directory if it is missing
            os.mkdir(root)
        if not os.path.exists(path):  # only download when the file does not exist yet
            r = requests.get(url_img)
            # 'wb' opens the file for binary writing: an existing file is
            # overwritten, a missing one is created; r.content is bytes.
            with open(path, 'wb') as f:
                f.write(r.content)
            print("file saved")
        else:
            print("file already exists")
    except:
        print("download failed")

def info(tex, i):
    # Extract name / period / category / collecting institution from the
    # description text and write them into row i of the worksheet.
    name = re.search(r'原名:.*\n年代:', tex)
    name = name[0].replace('原名:', '').replace('\n年代:', '')
    worksheet.write(i, 1, name)
    year = re.search(r'年代:.*\n类别:', tex)
    year = year[0].replace('年代:', '').replace('\n类别:', '')
    worksheet.write(i, 2, year)
    category = re.search(r'类别:.*\n', tex)
    category = category[0].replace('类别:', '').replace('\n', '')
    worksheet.write(i, 3, category)
    company = re.search(r'收藏单位:.*', tex)
    company = company[0].replace('收藏单位:', '').replace('\n', '')
    worksheet.write(i, 4, company)
    return str(name) + '\t' + str(year) + '\t' + str(category) + '\t' + str(company)


# Read the saved URLs line by line and fill the spreadsheet.
file = open("test.txt", 'w', encoding='utf-8')
myxls = xlwt.Workbook()  # create the workbook, i.e. a new Excel file
# Create a worksheet; call add_sheet again to add more sheets.
worksheet = myxls.add_sheet(u'my_worksheet', cell_overwrite_ok=True)
# worksheet.write(row, col, value) writes one cell; row and col are 0-based.
# The workbook must be saved at the end or nothing lands on disk.
n = 0
i = 0
file_name = 'process.txt'
with open(file_name) as file_obj:
    for url in file_obj:
        try:
            url = url.strip('\n')  # drop the trailing newline
            print(url)
            n = n + 1
            # Spoof a browser User-Agent so the site accepts the request.
            kv = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1 Edg/84.0.4147.89'}
            r = requests.get(url, headers=kv)
            time.sleep(0.5)
            print(r.status_code)
            r.encoding = r.apparent_encoding
            html = r.text
            soup = BeautifulSoup(html, 'html.parser')
            title = soup.findAll('h1')[0].string.strip()  # the page title lives in <h1>
            desc = soup.findAll('div', {'id': 'Description'})  # description block
            s = title
            print(s)
            # look_img(i, s)  # uncomment to also download the image as <i><title>.jpg
            worksheet.write(i, 0, title)  # column 0: title
            worksheet.write(i, 5, url)    # column 5: source URL
            tex = str(desc)
            information = info(tex, i)
            i = i + 1
            # file.write(title + "\n" + "\t" + "\t" + information + '\n' + url + "\n")
        except:
            myxls.save('my_worksheet.xls')  # save what we have if a page fails
myxls.save('my_worksheet.xls')
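
To sanity-check the result, the saved workbook can be read back. A quick sketch using xlrd, which still reads the legacy .xls format that xlwt writes:

import xlrd

book = xlrd.open_workbook('my_worksheet.xls')
sheet = book.sheet_by_name('my_worksheet')
for row in range(sheet.nrows):
    # columns: 0 title, 1 name, 2 period, 3 category, 4 institution, 5 URL
    print(sheet.row_values(row))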
