最近要搬家,有几十本书需要整理,想着先把它们分下类,为了避免思考哪本书应该归为哪类,想借助某买书网站的搜索来爬取相关分类信息。
把实体书分类
输入书名列表,返回书名与分类信息
整体流程
代码流程
import time
import requests
from bs4 import BeautifulSoup
def get_search_content_first_detail_url(book_name):
    """Search the bookstore site for *book_name* and return the detail-page
    URL of the first search result.

    Returns "" when there are no results, the page layout is unexpected,
    or the HTTP request fails.
    """
    # Replace with the search URL of the bookstore you actually use.
    url = 'http://search.XXXX.com/?key=%s&act=input' % book_name
    try:
        # timeout so a dead host cannot hang the whole run
        data = requests.get(url, timeout=10)
        soup = BeautifulSoup(data.content, "lxml")
        book_list_div = soup.find("div", id="search_nature_rg")
        book_list = book_list_div.find("ul").find_all("li")
        # Only the first hit matters; the site uses scheme-relative hrefs,
        # so prepend "http:".
        for book in book_list:
            a = book.find("a")
            return "http:" + a["href"]
        return ""
    except (requests.RequestException, AttributeError, KeyError, TypeError):
        # Network failure or unexpected markup (find() returned None /
        # missing href) -- treat as "not found" rather than crashing.
        return ""
def get_book_class(url):
    """Fetch a book's detail page and return its category breadcrumb
    joined with "/" (e.g. "文学/小说/历史小说").

    Returns "" for an empty/None URL or on any request/parse failure.
    """
    if not url:  # also covers None, unlike the old `url == ""` check
        return ""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Cookie': 'XXX',  # replace with your own cookie if login is required
    }
    try:
        # timeout so a dead host cannot hang the whole run
        data = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(data.content, "lxml")
        anchors = soup.find("li", class_="clearfix fenlei").find_all("a")
        return '/'.join(a.text for a in anchors)
    except (requests.RequestException, AttributeError):
        # Network failure or the category <li> is absent -- no category.
        return ""
file = open("book_name")
lines = file.readlines()
for line in lines:
time.sleep(1) # 请求慢点,防止封号
book_name = str.strip(line)
url = get_search_content_first_detail_url(book_name)
book_class = get_book_class(url)
print("%s\t%s" % (book_name, str.strip(book_class)))
header = {}
header['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
header['Accept'] = 'application/json, text/javascript, */*; q=0.01'