Code originality
All of the sample code below was written by me. It is provided for learning Python web scraping and for practicing Python with a MySQL database.
Scope
I wrote this small project when I was new to Python web scraping. The goal was to get familiar with how data scraping works and to combine it with a simple MySQL application, so the functionality is deliberately basic. The code has few comments, but names are chosen to be self-explanatory, and the functions and logic are all fairly simple, so I believe readability and learning will not suffer much.
It is suitable for beginners who are just getting into web scraping, though some patience is required.
Can this example do more?
Absolutely. Although this scraper is a basic single-threaded one, the data-extraction logic is split into separate, independent functions, so you can extend or modify them to fit your own learning and needs (one possible extension is sketched below). I believe it makes a nice little tool for anyone about to get into data mining, ML, or DL.
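For instance, here is a minimal sketch that dumps one page of results to a CSV file for later analysis. The file name and the choice of fields are my own assumptions, and it relies on the DouBan class defined further down:
import csv

def dump_page_to_csv(address, csvPath="top250_page.csv"):
    # Scrape a single page and write (name, year, score) rows to a CSV
    # file, e.g. as input for a later data-mining or ML exercise.
    douban = DouBan(address)
    names = douban.getNameOfSinglePage()
    years = douban.getYearOfSinglePage()
    scores = douban.getScoreOfSinglePage()
    with open(csvPath, "w", newline="", encoding="utf8") as f:
        writer = csv.writer(f)
        writer.writerow(["name", "year", "score"])
        writer.writerows(zip(names, years, scores))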
import time      # for the timestamp printed when a DouBan object is created
import MySQLdb   # connects to the MySQL database
import requests  # fetches the pages
import bs4       # parses the fetched HTML
Note: the MySQLdb.connect() call inside the connect() method below must be adjusted to your own MySQL configuration (host, user, password, database, port).
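With keyword arguments the call might look like this (the password and database name here are placeholders):
conn = MySQLdb.connect(host='127.0.0.1', user='root',
                       passwd='your_password', db='',
                       port=3306, charset='utf8')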
# A wrapper class around MySQLdb
class ToLocalMysql(object):
    def __init__(self):
        self.__flag = 0  # becomes 1 once a connection is established

    def connect(self):
        # Positional arguments: host, user, password, database, port.
        # Adjust these to your own MySQL configuration.
        self.conn = MySQLdb.connect('127.0.0.1', 'root', '', '', 3306, charset='utf8')
        self.cursor = self.conn.cursor()
        self.cursor.execute("set names utf8;")
        self.__flag = 1

    def execute(self, sql):
        if self.__flag == 1:
            self.cursor.execute(sql)
            result = self.cursor.fetchall()
            returnOut = []
            for i in result:
                out = list(i)  # convert each row tuple to a list
                print(out)
                returnOut.append(out)
            return returnOut
        else:
            print('execute() error : connection not established')

    def close(self):
        if self.__flag == 1:
            self.conn.close()
            self.__flag = 0
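A quick usage sketch of the wrapper (the database name is a placeholder for one you have already created):
mysql = ToLocalMysql()
mysql.connect()
mysql.execute("use your_db;")         # placeholder database name
rows = mysql.execute("show tables;")  # each row comes back as a list
mysql.close()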
# Scrapes the Douban Top 250 pages
class DouBan(object):
    def __init__(self, httpAddress):
        print("DouBan object created", time.ctime())
        # Douban may reject requests that lack a browser-like User-Agent.
        originData = requests.get(httpAddress, headers={'User-Agent': 'Mozilla/5.0'})
        self.soup = bs4.BeautifulSoup(originData.text, 'lxml')
    def find(self, keyWord, class_=""):
        # Collect the text of every matching tag, skipping entries that
        # start with a non-breaking space (the alternate-title spans).
        thing = self.soup.find_all(keyWord, class_)
        out = []
        for i in thing:
            temp = i.string
            if temp[0] != chr(160):  # chr(160) is the non-breaking space
                out.append(temp)
        return out
    def getNameOfSinglePage(self):
        return self.find('span', class_="title")

    def getScoreOfSinglePage(self):
        return self.find('span', class_="rating_num")
    def getYearOfSinglePage(self):
        # The second info line looks like "1994 / 美国 / 犯罪 剧情";
        # remove the spaces and slice out the four-digit year by offset.
        thing = self.soup.find_all('p', class_="")
        out = []
        for i in thing:
            temp = i.contents[2].replace(chr(32), "")[1:5]  # chr(32) is a space
            out.append(temp)
        return out

    def getCountryOfSinglePage(self):
        # Same info line as above; the country is sliced out by fixed
        # offsets, which is fragile and tied to Douban's current layout.
        thing = self.soup.find_all('p', class_="")
        out = []
        for i in thing:
            temp = i.contents[2].replace(chr(32), "")[8:10]
            out.append(temp)
        return out

    def getTypeOfSinglePage(self):
        # The genre is the third "/"-separated field; the trailing
        # characters are trimmed by a fixed offset (fragile as well).
        thing = self.soup.find_all('p', class_="")
        out = []
        for i in thing:
            temp = (i.contents[2].split("/")[2])[1:-25]
            out.append(temp)
        return out

    def getDirectorOfSinglePage(self):
        # The first info line looks like "导演: XXX   主演: YYY";
        # keep the first word between "导演" (director) and "主演" (cast).
        thing = self.soup.find_all('p', class_="")
        out = []
        for i in thing:
            temp = i.contents[0]
            start = temp.find("导演")
            end = temp.find("主演")
            out.append(temp[start:end].split(" ")[1])
        return out

    def getNumOfJudgeOfSinglePage(self):
        # Rating counts appear as "NNN人评价"; keep only spans whose text
        # ends with "价" and strip the trailing "人评价" (3 characters).
        thing = self.soup.find_all('span', class_="")
        out = []
        for i in thing:
            temp = i.contents
            if temp != []:
                temp = str(temp[0])
                if len(temp) > 3:
                    if temp[-1] == "价":
                        out.append(temp[:-3])
        return out
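A quick usage sketch for a single page (the URL is the real first page of the Top 250 list; everything else is as defined above):
douban = DouBan('https://movie.douban.com/top250')
for name, score in zip(douban.getNameOfSinglePage(), douban.getScoreOfSinglePage()):
    print(name, score)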
The table must be created in the MySQL database beforehand; the column names below describe its columns, which you may also modify (a possible CREATE TABLE sketch follows).
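One possible way to create the table through the wrapper above. The database/table names and the column types here are my own assumptions:
mysql = ToLocalMysql()
mysql.connect()
mysql.execute("use your_db;")  # placeholder database name
mysql.execute("""create table if not exists top250 (
    name varchar(100),
    year varchar(10),
    country varchar(50),
    kind varchar(50),
    director varchar(100),
    score float,
    judges int
);""")
mysql.close()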
# Pass in the database name and the name of an existing table.
# Expected columns: (name, year, country, kind, director, score, judges)
def get_douban_top250_data(db, table):
    mysql = ToLocalMysql()
    mysql.connect()
    mysql.execute("use %s;" % db)
    # The Top 250 list spans 10 pages of 25 movies each.
    for loop in range(10):
        if loop == 0:
            address = 'https://movie.douban.com/top250'
        else:
            address = "https://movie.douban.com/top250?start=%d&filter=" % (loop * 25)
        douban = DouBan(address)
        name = douban.getNameOfSinglePage()
        year = douban.getYearOfSinglePage()
        country = douban.getCountryOfSinglePage()
        director = douban.getDirectorOfSinglePage()
        score = douban.getScoreOfSinglePage()
        judgesNum = douban.getNumOfJudgeOfSinglePage()
        kind = douban.getTypeOfSinglePage()
        for i in range(len(name)):
            print(str(name[i]), str(year[i]), str(country[i]), str(kind[i]),
                  str(director[i]), float(score[i]), int(judgesNum[i]))
            # MySQL table names cannot be wrapped in single quotes, so the
            # table name is interpolated bare. Building SQL with % formatting
            # works for this exercise but is open to SQL injection;
            # parameterized queries are preferable in real code.
            mysql.execute("insert into %s\n"
                          "(name, year, country, kind, director, score, judges)\n"
                          "values('%s', '%s', '%s', '%s', '%s', %f, %d);"
                          % (table, str(name[i]), str(year[i]),
                             str(country[i]), str(kind[i]),
                             str(director[i]), float(score[i]), int(judgesNum[i])))
        mysql.conn.commit()  # commit once per page
    mysql.close()
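Finally, a possible entry point (the database and table names are placeholders for whatever you created above):
if __name__ == '__main__':
    get_douban_top250_data('your_db', 'top250')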