Scraping the Douban Top 250 movies with Python and saving the data to a database

#-*- coding:utf-8 -*-

"""获取时光影评电影"""

import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pymysql  # used to talk to the MySQL database

class MySqlCommand(object):

    # initialise the connection settings
    def __init__(self):
        self.host = "127.0.0.1"
        self.port = 3306        # port number
        self.user = "root"      # user name
        self.password = ""      # password
        self.db = ""            # database name (fill in before running)
        self.table = ""         # table name (not used below)

    # connect to the database
    def connectMysql(self):
        try:
            self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
                                        passwd=self.password, db=self.db, charset='utf8')
            self.cursor = self.conn.cursor()
            return self.cursor, self.conn
        except Exception:
            print('connect mysql error.')

# build the URL of the page that starts at the given rank
def get_url(root_url, start):
    return root_url + "?start=" + str(start) + "&"
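Each page of the Top 250 list holds 25 movies, so start values of 0, 25, 50, ... select successive pages; for example (a quick check, not part of the original script):

# get_url("https://movie.douban.com/top250", 25)
# -> "https://movie.douban.com/top250?start=25&"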

def get_review(page_url):
    """Fetch the movie information on one page and store it in MySQL."""
    cursor, db = MySqlCommand().connectMysql()
    # create_table = """CREATE TABLE douban(id INT(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, rank VARCHAR(128), title VARCHAR(128), score VARCHAR(128), descs VARCHAR(128))"""
    # `rank` is a reserved word in MySQL 8.0+, so it is quoted with backticks
    create_table = ("CREATE TABLE douban("
                    "`rank` varchar(255),"
                    "title varchar(255),"
                    "score varchar(255),"
                    "descs varchar(255))")
    cursor.execute("DROP TABLE IF EXISTS douban")
    cursor.execute(create_table)
    movies_list = []
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'lxml')
    soup = soup.find("ol", "grid_view")       # the <ol class="grid_view"> holds the 25 movie entries
    for tag_li in soup.find_all("li"):
        movie = {}
        movie['rank'] = tag_li.find("em").string                      # ranking number
        movie['title'] = tag_li.find_all("span", "title")[0].string   # Chinese title
        movie['score'] = tag_li.find("span", "rating_num").string     # rating
        if tag_li.find("span", "inq"):                                 # one-line review, if present
            movie['desc'] = tag_li.find("span", "inq").string
        else:
            movie['desc'] = 'no short review'
        cursor.execute("INSERT INTO douban(`rank`,title,score,descs)"
                       " VALUES(%s,%s,%s,%s)",
                       (movie['rank'], movie['title'], movie['score'], movie['desc']))
    db.commit()
    db.close()
    # movies_list.append(movie)
    # return movies_list
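Once get_review has run, a quick way to confirm that the rows really landed in MySQL is to read a few of them back through the same helper class. A minimal check, assuming the empty db setting in MySqlCommand has been filled in with your database name:

cursor, conn = MySqlCommand().connectMysql()
cursor.execute("SELECT `rank`, title, score, descs FROM douban LIMIT 5")
for row in cursor.fetchall():     # print the first five stored movies
    print(row)
conn.close()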

if __name__ == '__main__':

root_url= "https://movie.douban.com/top250"start=0

movies_list=get_review(get_url(root_url,start))#for movies in movies_list:

#print(movies)
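As written, the script only fetches the first page (start=0), and a bare requests.get with no headers is increasingly likely to be rejected by Douban. Below is a minimal sketch of walking all ten pages with a browser-style User-Agent; the header value is an assumption, and note that get_review above drops and re-creates the douban table on every call, so its table setup would have to be pulled out of the function before plugging it into such a loop.

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0"}   # assumed header; Douban may refuse requests without one
for start in range(0, 250, 25):           # 10 pages of 25 movies each
    page = requests.get("https://movie.douban.com/top250?start=" + str(start), headers=headers)
    soup = BeautifulSoup(page.text, "lxml")
    for tag_li in soup.find("ol", "grid_view").find_all("li"):
        print(tag_li.find("em").string, tag_li.find_all("span", "title")[0].string)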
