Code for scraping archaeological artifact information and images
1. Scrape the image URL and download the image
2. Scrape the related information and save it to a txt file
import os
import re
import requests
from bs4 import BeautifulSoup
# The text information is saved to test.txt

def look_img(i):
    # Grab the image address: collect every <img> tag on the page
    img_src = soup.findAll("img")  # soup is the BeautifulSoup object of the current page (module-level)
    n = 1
    url_img = ""  # address of the image to download
    for img in img_src:
        n = n + 1
        img = img.get('src')  # read the src attribute
        if n == 5:  # the 4th <img> tag is taken as the artifact photo (n is incremented before the check)
            url_img = img
            print(url_img)
    # Save the image
    root = "C://Users//123//Desktop//images//"  # root directory for the downloads
    path = root + str(i) + ".jpg"  # full path of the saved file
    try:
        if not os.path.exists(root):  # create the root directory if it does not exist
            os.mkdir(root)
        if not os.path.exists(path):  # download and save only if the file does not exist yet
            r = requests.get(url_img)
            with open(path, 'wb') as f:  # 'wb' opens the file for binary writing: an existing file is overwritten, otherwise a new one is created
                f.write(r.content)  # r.content is binary data, hence 'wb'
            print("File saved")
        else:
            print("File already exists")
    except:
        print("Download failed")
def info(tex):
    # Extract the original name, dating and category from the page text
    name = re.search(r'原名:.*\n年代:', tex)
    name = name[0].replace('原名:', '')
    name = name.replace('\n年代:', '')
    #print(name)
    year = re.search(r'年代:.*\n类别:', tex)
    year = year[0].replace('年代:', '')
    year = year.replace('\n类别:', '')
    #print(year)
    category = re.search(r'类别:.*\n', tex)
    category = category[0].replace('类别:', '')
    category = category.replace('\n', '')
    #print(category)
    company = re.search(r'收藏单位:.*', tex)
    company = company[0].replace('收藏单位:', '')
    company = company.replace('\n', '')
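Neither helper above includes the loop that drives it: soup is read as a module-level object, and the result of info() is not yet written anywhere. A minimal driver sketch follows; the URL list, the test.txt output path, and the assumption that info() is extended to return the four extracted fields are illustrative only, not part of the original code.

# Minimal driver sketch. Assumptions: page_urls holds artifact page URLs,
# info() is extended to return (name, year, category, company),
# and the extracted text is appended to test.txt.
page_urls = []  # fill with artifact page URLs, e.g. the ones collected in the next step
with open("C:\\Users\\123\\Desktop\\test.txt", 'a', encoding='utf-8') as out:
    for i, url in enumerate(page_urls):
        r = requests.get(url)
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")  # module-level object read by look_img()
        look_img(i)                     # download the artifact photo
        fields = info(soup.get_text())  # assumed to return the four fields
        out.write('\t'.join(fields) + '\n')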
Match the pages that meet the conditions and save their URLs to a txt file
import random
import time
import requests
from bs4 import BeautifulSoup
import os
import re
url = 'http://www.cd3000y.com/html/movablerelics/'
r = requests.get(url)
r.encoding = r.apparent_encoding
html = r.text
soup = BeautifulSoup(html,"html.parser")
aim_list = ['尊','樽','壶','爵','角','觥','觚','彝','卣','罍','瓿','杯','卮','缶','豆','斝','盉','觯','瓮','钵','方彝','斗','碗','区','皿','鉴','斛','舟','羽觞','注子','温酒器']
file = open("C:\\Users\\123\\Desktop\\process"+".txt",'a',encoding='utf-8')
url_list = soup.findAll('a')
web_list = []
error_list = []
for i in url_list:
    url = i.get('href')
    url = "http://www.cd3000y.com" + url
    web_list.append(url)
for i in range(len(web_list)):
    url = web_list[i]
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except:
        continue
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    title = soup.findAll('h1')
    title = str(title)
    # if (title == 'None'):
    #     error_list.append(url)
    #     continue
    for j in range(len(aim_list)):
        result = re.search(aim_list[j], title)
        if result is not None:  # the page title mentions one of the target vessel types
            print(result)
            file.write(url + '\n')
            print(url)
            break
    time.sleep(random.random() * 3)  # random pause between requests
file.close()  # flush the collected URLs to process.txt
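At this point process.txt holds one matching URL per line. The extraction script in the next step reads them back; a minimal sketch, assuming the same desktop path:

# Re-load the saved URLs for the extraction step (path mirrors the one above)
with open("C:\\Users\\123\\Desktop\\process.txt", encoding='utf-8') as f:
    web_list = [line.strip() for line in f if line.strip()]
print(len(web_list), "URLs loaded")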
Read the URLs from the txt file, extract the information, save the text data to an Excel file, and download the images
import os
import re
import time
import requests
import xlwt
from bs4 import BeautifulSoup

def look_img(i, s):
    # Grab the image address: collect every <img> tag on the page
    img_src = soup.findAll("img")  # soup is the BeautifulSoup object of the current page (module-level)
    n = 1
    url_img = ""  # address of the image to download
    for img in img_src:
        n = n + 1
        img = img.get('src')  # read the src attribute
        if n == 5:  # the 4th <img> tag is taken as the artifact photo (n is incremented before the check)
            url_img = img
            print(url_img)
    # Save the image
    root = "C://Users//123//Desktop//images//"  # root directory for the downloads
    path = root + str(i) + s + ".jpg"  # full path of the saved file
    try:
        if not os.path.exists(root):  # create the root directory if it does not exist
            os.mkdir(root)
        if not os.path.exists(path):  # download and save only if the image does not exist yet
            r = requests.get(url_img)
            with open(path, 'wb') as f:  # 'wb' opens the file for binary writing: an existing file is overwritten, otherwise a new one is created
                f.write(r.content)  # r.content is binary data, hence 'wb'
            print("File saved")
        else:
            print("File already exists")
    except:
        print("Download failed")
def info(tex, i):
    # Extract the fields from the page text and write them to row i of the worksheet
    name = re.search(r'原名:.*\n年代:', tex)
    name = name[0].replace('原名:', '')
    name = name.replace('\n年代:', '')
    worksheet.write(i, 1, name)
    #print(name)
    year = re.search(r'年代:.*\n类别:', tex)
    year = year[0].replace('年代:', '')
    year = year.replace('\n类别:', '')
    worksheet.write(i, 2, year)
    #print(year)
    category = re.search(r'类别:.*\n', tex)
    category = category[0].replace('类别:', '')
    category = category.replace('\n', '')
    worksheet.write(i, 3, category)
    #print(category)
    company = re.search(r'收藏单位:.*', tex)
    company = company[0].replace('收藏单位:', '')
    company = company.replace('\n', '')
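The code above references a worksheet object that is created elsewhere, and the loop that feeds it URLs from process.txt is not shown in this excerpt; following the pattern of the other fields, the holding institution would presumably be written to column 4 inside info(). A minimal driver sketch under those assumptions follows; the workbook and sheet names, the output file relics.xls, and the image-name suffix are illustrative, not taken from the original code.

# Minimal driver sketch. Assumptions: column 0 holds the URL and the
# sheet/file names are arbitrary placeholders.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('relics')  # module-level object written by info()
with open("C:\\Users\\123\\Desktop\\process.txt", encoding='utf-8') as f:
    web_list = [line.strip() for line in f if line.strip()]
for i, url in enumerate(web_list):
    r = requests.get(url)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")  # module-level object read by look_img()
    worksheet.write(i, 0, url)
    look_img(i, 'a')           # 'a' is a hypothetical suffix for the image file name
    info(soup.get_text(), i)   # the page text is assumed to contain the 原名/年代/类别/收藏单位 lines
    time.sleep(1)              # pause between requests
workbook.save("C:\\Users\\123\\Desktop\\relics.xls")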