本次实战的目的是更好地学习 MySQL 和 MongoDB 的操作:从猫眼电影上爬取电影信息,并加上简单的 Python GUI 界面操作,效果图如下:
废话不多说,首先导入本次项目需要导入的模块,以及主程序入口代码如下:
import requests
from bs4 import BeautifulSoup
import re
import pymysql
from pymongo import MongoClient
import time
import wx
# The movie table is dropped and recreated before every crawl.
def create_table_mysql():
    """Drop the ``movie`` table if it exists and create it again, empty.

    Connects to the ``Movies_mao`` database on the local MySQL server and
    issues a ``drop table`` followed by a ``create table``.

    Raises:
        Any exception raised by ``pymysql`` while connecting or executing
        the statements; the connection is closed before it propagates.
    """
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    db = pymysql.connect(host="localhost", user='root', password='Lizzie94', port=3306, db='Movies_mao')
    try:
        with db.cursor() as cursor:
            cursor.execute("drop table if EXISTS movie ")
            sql = """create table movie(
                id int UNSIGNED not null auto_increment,
                name char(50) not null,
                actor varchar(400) not null,
                time DATE not null,
                score FLOAT,
                PRIMARY KEY (id))ENGINE = InnoDB Default charset=UTF8MB4;
                """
            cursor.execute(sql)
    finally:
        # Always release the connection, even when a statement fails.
        db.close()
def crawurl(url, timeout=10):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The response text when the status code is 200; ``None`` when the
        status is anything else or the request itself fails.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(e)
        return None
    if r.status_code == 200:
        return r.text
    print('request failed, status is {}'.format(r.status_code))
    return None
def parse(html):
    """Extract movie records from the HTML of a board page.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing.

    Yields:
        One dict per ``div.board-item-content`` element with the keys
        ``name``, ``actor``, ``time`` and ``score``; all values are strings.
    """
    # A failed download hands us None; there is nothing to parse then.
    if not html:
        return
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', "board-item-content"):
        movie_name = item.find("p", class_="name").a.text
        # Drop all whitespace, then the "主演:" label in front of the cast.
        movie_star = re.sub(r'\s+', '', item.find("p", class_="star").text).replace("主演:", "")
        # Drop the "上映时间:" label and any parenthesised suffix.
        movie_release_time = re.sub(
            r'\((.*)\)', '',
            item.find("p", class_="releasetime").text.replace("上映时间:", ""))
        # The score is split across an integer part and a fraction part.
        movie_score = ''.join([item.find('i', class_="integer").text,
                               item.find('i', class_="fraction").text])
        yield {
            'name': movie_name,
            'actor': movie_star,
            'time': movie_release_time,
            'score': movie_score,
        }
def load_to_mysql(data):
    """Insert one movie record into the MySQL ``movie`` table.

    Args:
        data: Mapping of column name to value. The keys become the column
            list of the insert statement and the values are passed as
            query parameters.

    A failing insert is reported on stdout and rolled back rather than
    raised, so one bad row does not stop the caller. Errors while
    connecting still propagate.
    """
    keys = ",".join(data.keys())
    values = ",".join(["%s"] * len(data))
    movie_item = tuple(data.values())
    sql_insert = "insert into movie({keys}) values ({values})".format(keys=keys, values=values)
    # Open the database connection.
    db = pymysql.connect(host="localhost", user='root', password='Lizzie94', port=3306, db='Movies_mao')
    try:
        with db.cursor() as cursor:
            try:
                if cursor.execute(sql_insert, movie_item):
                    print('insert successfully')
                db.commit()
            except Exception as e:
                # Best effort per row: report, undo and carry on.
                print("failed", e.args)
                db.rollback()
    finally:
        # Close the connection even if the rollback itself fails.
        db.close()
def load_to_mangoDB(data):
    """Insert one movie record into the MongoDB collection ``Mao_Movie.Movie``.

    Args:
        data: Document to store. The driver adds an ``_id`` key to it as a
            side effect of the insert.

    A failing insert is reported on stdout instead of being raised.
    """
    client = MongoClient('localhost')  # connect to the local server
    try:
        # Database and collection are created on first write if missing.
        post = client.Mao_Movie.Movie
        # insert_one replaces the deprecated Collection.insert.
        result = post.insert_one(data)
        if result.inserted_id is not None:
            print('insert MongoDB successfully')
    except Exception as e:
        print('insert MongoDB failed', e.args)
    finally:
        client.close()
def main(event):
    """Crawl the board pages and store every movie; bound to crawl_button.

    Recreates the MySQL table, then downloads ten pages (offsets 0-90),
    parses each one and writes every record to MySQL and MongoDB. A message
    box reports the outcome when the loop ends.

    Args:
        event: The wx button event; not used.
    """
    base_url = 'https://maoyan.com/board/4?offset='
    page = 11
    # Recreate the table before crawling.
    create_table_mysql()
    failed_pages = 0
    for i in range(1, page):
        url = base_url + str((i - 1) * 10)
        html = crawurl(url)
        if html is None:
            # Download failed; skip the page instead of parsing None.
            failed_pages += 1
            continue
        for item in parse(html):
            print(item)
            load_to_mysql(item)
            load_to_mangoDB(item)
    if failed_pages:
        wx.MessageBox(
            "crawl finished, but {} page(s) failed to download".format(failed_pages),
            "Message", wx.OK | wx.ICON_WARNING)
    else:
        wx.MessageBox("crawl movies successfully", "Message", wx.OK | wx.ICON_INFORMATION)
以下代码是整个项目的主入口程序,需要用 wx 建立一个 GUI 界面;按钮 crawl_button 绑定 main() 事件,点击后触发爬取电影事件。
if __name__ == '__main__':
    # Script entry point: build the wx window, run the event loop and
    # print how long the application was open.
    t1 = time.time()
    # GUI construction.
    app = wx.App()
    frame = wx.Frame(None, title="Spide movie", pos=(1000, 200), size=(500, 400))
    panel = wx.Panel(frame)

    # Row of static labels in a horizontal sizer, 1:1 proportion.
    lb_box = wx.BoxSizer(wx.HORIZONTAL)
    lb_srch_cont = wx.StaticText(panel, -1, '搜索内容:')
    lb_srch_type = wx.StaticText(panel, -1, '搜索类别:')
    lb_box.Add(lb_srch_cont, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)
    lb_box.Add(lb_srch_type, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)

    # Row with the search text box, the type drop-down and the search button.
    text_box = wx.BoxSizer(wx.HORIZONTAL)
    # Text input for the search term.
    content_text = wx.TextCtrl(panel, -1)
    list_type = ['演员名字', '电影名字']
    # Drop-down list of search types.
    type_combox = wx.ComboBox(panel, -1, choices=list_type)
    # Search button.
    srch_button = wx.Button(panel, label="搜索")
    # Bind the search handler.
    # NOTE: hit_me must already be defined when this line runs; in this
    # listing it appears below the entry point, so it has to be placed
    # above this block in a runnable file.
    srch_button.Bind(wx.EVT_BUTTON, hit_me)
    text_box.Add(content_text, proportion=3, flag=wx.EXPAND | wx.ALL, border=3)
    text_box.Add(type_combox, proportion=2, flag=wx.EXPAND | wx.ALL, border=3)
    text_box.Add(srch_button, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)

    # Multi-line text box that shows the search results.
    srch_content = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.HSCROLL)
    # Button that starts the crawl.
    crawl_button = wx.Button(panel, label="开始爬取猫眼电影TOP100")
    # Bind the crawl handler; a handler takes exactly one argument, the event.
    crawl_button.Bind(wx.EVT_BUTTON, main)

    # Stack all rows vertically.
    v_box = wx.BoxSizer(wx.VERTICAL)
    v_box.Add(lb_box, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)
    v_box.Add(text_box, proportion=1, flag=wx.EXPAND | wx.ALL, border=3)
    v_box.Add(srch_content, proportion=10, flag=wx.EXPAND | wx.ALL, border=3)
    v_box.Add(crawl_button, proportion=1, flag=wx.ALIGN_CENTER_HORIZONTAL | wx.ALL, border=3)
    panel.SetSizer(v_box)

    frame.Show()
    app.MainLoop()
    # Elapsed time is now minus start (the operands were reversed before,
    # which printed a negative duration).
    print('Total time:', time.time() - t1)
点击搜索button, 获取输入文本框和下拉菜单值,进行数据库查询操作
# Handler for a click on the search button.
def hit_me(event):
    """Query the MySQL ``movie`` table and show the matching rows.

    Reads the search term from ``content_text`` and the search type from
    ``type_combox`` (module-level widgets), runs a ``LIKE`` query on the
    actor or name column, and writes one line per row into
    ``srch_content``.

    Args:
        event: The wx button event; not used.
    """
    content = content_text.GetValue()
    search_type = type_combox.GetValue()
    # Show an error when the text box or the drop-down is empty.
    if not (content and search_type):
        wx.MessageBox("please input some values", "Message", wx.OK | wx.ICON_INFORMATION)
        return
    # The column name comes from this fixed pair, never from user input.
    field = 'actor' if search_type == "演员名字" else 'name'
    value = '%' + content + '%'
    # Open the database connection for the query.
    db = pymysql.connect(host="localhost", user='root', password='Lizzie94', port=3306, db='Movies_mao')
    try:
        with db.cursor() as cursor:
            # Pass the user text as a query parameter so the driver escapes
            # it; formatting it into the SQL allowed injection.
            cursor.execute(
                "Select * from movie where {field} like %s ".format(field=field),
                (value,))
            results = cursor.fetchall()
        all_row = ''.join(' '.join(str(i) for i in each) + '\n' for each in results)
        srch_content.SetValue(all_row)
    except Exception as e:
        wx.MessageBox("selection from database error", "Message", wx.OK | wx.ICON_INFORMATION)
        print(e)
    finally:
        db.close()
最后的效果图如下:
未完待续,下次需要用异步方法来爬虫,本人基础不是很扎实,只能用同步方法来爬取数据。
最后附上 wxPython 的中文学习资料: https://www.ctolib.com/docs/sfile/wxpy-in-action/12.html
英文学习地址:http://zetcode.com/wxpython/
https://www.wxpython.org/