基础页面:https://movie.douban.com/top250
代码:
from time import sleep
from requests import get
from bs4 import BeautifulSoup
import re
import pymysql
# MySQL connection for the local `douban` schema.
# NOTE: credentials are hard-coded for a local dev setup; DictCursor makes
# every fetched row a plain dict keyed by column name.
_db_config = {
    'host': 'localhost',
    'user': 'root',
    'password': '123456',
    'db': 'douban',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor,
}
db = pymysql.connect(**_db_config)
# Create the destination table once (idempotent via IF NOT EXISTS).
# NOTE(review): hyphenated column names (`page-code`, `origin-title`) are
# legal only when backtick-quoted; every later query must write `page-code`,
# otherwise MySQL parses it as the subtraction `page - code` and reports
# "Unknown column 'page'". snake_case names would avoid the pitfall, but the
# existing names are kept so code elsewhere that inserts into them still works.
try:
    with db.cursor() as cursor:
        sql = (
            "CREATE TABLE IF NOT EXISTS `top250` ("
            "`id` int(6) NOT NULL AUTO_INCREMENT,"
            "`top` int(6) NOT NULL,"
            "`page-code` int(6) NOT NULL,"
            "`title` varchar(255) NOT NULL,"
            "`origin-title` varchar(255),"
            "`score` float NOT NULL,"
            "`theme` varchar(255) NOT NULL,"
            "PRIMARY KEY(`id`)"
            ") ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=1;"
        )
        cursor.execute(sql)
    # Commit only after the DDL succeeded. The original committed inside
    # `finally`, which also ran when execute() had raised an error.
    db.commit()
except pymysql.MySQLError:
    db.rollback()
    raise
# Entry point of the Top-250 listing.
base_url = 'https://movie.douban.com/top250'

# Browser-like request headers for movie.douban.com.
# NOTE: 'Cookie' and 'User-Agent' are 'xxx' placeholders and must be replaced
# with real values copied from a logged-in browser session before crawling.
header = dict([
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Accept-Language', 'zh-CN,zh;q=0.9'),
    ('Cache-Control', 'max-age=0'),
    ('Connection', 'keep-alive'),
    ('Cookie', 'xxx'),
    ('Host', 'movie.douban.com'),
    ('Referer', 'https://movie.douban.com/chart'),
    ('Upgrade-Insecure-Requests', '1'),
    ('User-Agent', 'xxx'),
])
def crawler(url=None, headers=None, delay=1):
r = get(url=url, headers=headers, timeout=3)
soup = BeautifulSoup(r.text, 'html.parser')
page_tag = soup.find('span', attrs={'class': 'thispage'})
page_code = re.compile(r'(.*)').findall(str(page_tag))[0]
movie_ranks = soup.find_all('em', attrs={'class': ''})
movie_titles = soup.find_all('div', attrs={'class': 'hd'})
movie_scores = soup.find_all('span', attrs={'class': 'rating_num'})
movie_themes = soup.find_all('span', attrs={'class': 'inq'})
next_page = soup.find('link', attrs={'rel': 'next'})
for ranks, titles, scores, themes in zip(movie_ranks, movie_titles, movie_scores, movie_themes):
rank = re.compile(r'(.*)').findall(str(ranks))
regex_ts = re.compile(r'(.*)').findall(str(titles))
title = regex_ts[0]
score = re.compile(r'
结果:
mysql> select top,title,score from top250 where id = 175; +-----+--------+-------+ | top | title | score | +-----+--------+-------+ | 176 | 罗生门 | 8.7 | +-----+--------+-------+ 1 row in set (0.00 sec) mysql> select top,title,page-code,score from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> select top,page-code,title,score from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> select page-code from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> describe top250 -> ; +--------------+--------------+------+-----+---------+----------------+ | Field | Type | Null | Key | Default | Extra | +--------------+--------------+------+-----+---------+----------------+ | id | int(6) | NO | PRI | NULL | auto_increment | | top | int(6) | NO | | NULL | | | page-code | int(6) | NO | | NULL | | | title | varchar(255) | NO | | NULL | | | origin-title | varchar(255) | YES | | NULL | | | score | float | NO | | NULL | | | theme | varchar(255) | NO | | NULL | | +--------------+--------------+------+-----+---------+----------------+ 7 rows in set (0.32 sec) mysql> select page-code from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> select origin-title from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'origin' in 'field list' mysql> select origin_title from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'origin_title' in 'field list' mysql> select * from top250 where id = 175; +-----+-----+-----------+--------+--------------+-------+-------------------+ | id | top | page-code | title | origin-title | score | theme | +-----+-----+-----------+--------+--------------+-------+-------------------+ | 175 | 176 | 8 | 罗生门 | 羅生門 | 8.7 | 人生的N种可能性。 | +-----+-----+-----------+--------+--------------+-------+-------------------+ 1 row in set (0.00 sec) mysql> select * from top250 where title = 未麻的部屋; ERROR 1054 (42S22): 
Unknown column '未麻的部屋' in 'where clause' mysql> select * from top250 where top=175; Empty set (0.00 sec) mysql>
两个小问题:
1.没想到数据库字段名里带'-'这么麻烦……其实 MySQL 字段名是可以包含'-'的,只是查询时必须用反引号把整个列名括起来(例如 SELECT `page-code` FROM top250),否则会被解析成减法表达式 page - code,于是报 "Unknown column 'page'"。更省事的做法是建表时就用下划线命名(page_code、origin_title)。
2.不知道为啥top175的电影《未麻的部屋》没爬到。。。(推测:该条目页面上可能缺少短评 inq 标签,而 zip() 会按最短的那个列表截断并让后面的条目错位——这也能解释为什么 id=175 对应的 top 是 176。)
建议使用scrapy。
用scrapy的一些好处是配置爬虫很方便,还有其内部自带的html解析器、对不完整的url的组建等十分便利。
最后,吐槽一下,之前的电脑配置太差,跑深度学习程序的过程耗尽内存,出现莫名的bug后,蓝屏死机就再也没法启动了。。。所以,暂时不能更新博客了。。。