上一个项目请看:https://blog.csdn.net/az9996/article/details/85094411
在上一个项目中,我们已经获取到了一定数量的电影 URL;这次来获取单个电影的详细信息。
#对传递的url返回一个名为soup的Beautifulsoup对象
def get_url_html_soup(url, timeout=10):
    """Download ``url`` and return its body parsed as a BeautifulSoup object.

    Request headers and proxies are taken from ``request_body.get_header()``
    and ``request_body.get_proxy()``.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.
            A finite default is used because ``requests.get`` without a
            timeout can block indefinitely on an unresponsive server/proxy.

    Returns:
        A ``bs4.BeautifulSoup`` built with the ``lxml`` parser when the
        response status is 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection errors, timeouts, ...).
    """
    header = request_body.get_header()
    proxies = request_body.get_proxy()
    req = requests.get(url=url, proxies=proxies, headers=header,
                       timeout=timeout)

    # Guard clause: anything other than 200 is reported and yields None.
    if req.status_code != 200:
        print("请求失败!")
        return None

    print("请求成功!")
    return bs4.BeautifulSoup(req.text, 'lxml')
#因为url中有电影的id,所以这里就用正则表达式来获取url中的数字。
def get_movie_id(url):
    """Return all digit characters of ``url`` concatenated into one string.

    The movie id is embedded in the detail-page URL, so the digits are
    collected in order of appearance. Note that *every* digit in the URL is
    included, so digits outside the id (e.g. in a query string) end up in
    the result as well.

    Args:
        url: URL string to scan.

    Returns:
        A string made of the digits found in ``url``; empty if there are none.
    """
    # Join the single-digit matches directly: no separator is inserted, so
    # nothing has to be stripped out again afterwards.
    return ''.join(re.findall(r'\d', url))
#将soup作为传递参数,从中获取相应的信息
# 返回影片标题
def get_movie_title(soup):
    """Return the text of the ``<h1>`` element found in ``soup``.

    Args:
        soup: Parsed page exposing ``find``; presumably the BeautifulSoup
            object of a movie detail page.

    Returns:
        The ``text`` of the ``<h1>`` element, returned as-is (surrounding
        whitespace is not stripped).

    Raises:
        AttributeError: If ``soup.find('h1')`` returns ``None`` because the
            page has no ``<h1>``.
    """
    heading = soup.find('h1')
    return heading.text
# 返回影片的导演名
def get_movie_directors(soup):
    """Return the director names listed in the page's info panel.

    Reads the text of ``<div id="info">``, captures what follows the first
    ``导演:`` label up to the end of that line, splits the capture on
    whitespace and passes the tokens to ``file_operation.segmented``.

    Args:
        soup: Parsed movie detail page (presumably a BeautifulSoup object).

    Returns:
        Whatever ``file_operation.segmented`` builds from the tokens
        (presumably one joined string -- confirm against that helper).

    Raises:
        AttributeError: If the page has no ``<div id="info">``.
        IndexError: If the info panel contains no ``导演:`` label.
    """
    info_panel = soup.find('div', id='info')
    # "." does not match a newline, so the capture ends with the label's line.
    matches = re.findall(r'导演:(.*)', info_panel.text)
    tokens = matches[0].split()
    return file_operation.segmented(tokens)
# 返回影片的编剧名
def get_movie_screenwriter(soup):
    """Return the screenwriter names listed in the page's info panel.

    Reads the text of ``<div id="info">``, captures what follows the first
    ``编剧:`` label up to the end of that line, splits the capture on
    whitespace and passes the tokens to ``file_operation.segmented``.

    Args:
        soup: Parsed movie detail page (presumably a BeautifulSoup object).

    Returns:
        Whatever ``file_operation.segmented`` builds from the tokens
        (presumably one joined string -- confirm against that helper).

    Raises:
        AttributeError: If the page has no ``<div id="info">``.
        IndexError: If the info panel contains no ``编剧:`` label.
    """
    info_panel = soup.find('div', id='info')
    # "." does not match a newline, so the capture ends with the label's line.
    matches = re.findall(r'编剧:(.*)', info_panel.text)
    tokens = matches[0].split()
    return file_operation.segmented(tokens)
# 返回影片的主演名
def get_movie_character(soup):
    """Return the leading-actor names listed in the page's info panel.

    Reads the text of ``<div id="info">``, captures what follows the first
    ``主演:`` label up to the end of that line, splits the capture on
    whitespace and passes the tokens to ``file_operation.segmented``.

    Args:
        soup: Parsed movie detail page (presumably a BeautifulSoup object).

    Returns:
        Whatever ``file_operation.segmented`` builds from the tokens
        (presumably one joined string -- confirm against that helper).

    Raises:
        AttributeError: If the page has no ``<div id="info">``.
        IndexError: If the info panel contains no ``主演:`` label.
    """
    info_panel = soup.find('div', id='info')
    # "." does not match a newline, so the capture ends with the label's line.
    matches = re.findall(r'主演:(.*)', info_panel.text)
    tokens = matches[0].split()
    return file_operation.segmented(tokens)
# 返回影片的类型
def get_movie_type(soup):
    """Return the genre(s) listed in the page's info panel.

    Reads the text of ``<div id="info">``, captures what follows the first
    ``类型:`` label up to the end of that line, splits the capture on
    whitespace and passes the tokens to ``file_operation.segmented``.

    Args:
        soup: Parsed movie detail page (presumably a BeautifulSoup object).

    Returns:
        Whatever ``file_operation.segmented`` builds from the tokens
        (presumably one joined string -- confirm against that helper).

    Raises:
        AttributeError: If the page has no ``<div id="info">``.
        IndexError: If the info panel contains no ``类型:`` label.
    """
    info_panel = soup.find('div', id='info')
    # Local is deliberately not called ``type`` so the builtin stays usable.
    # "." does not match a newline, so the capture ends with the label's line.
    matches = re.findall(r'类型:(.*)', info_panel.text)
    tokens = matches[0].split()
    return file_operation.segmented(tokens)
# 返回影片的制片国家/地区
def get_movie_country(soup):
    """Return the production country/region listed in the page's info panel.

    Reads the text of ``<div id="info">``, captures what follows the first
    ``制片国家/地区:`` label up to the end of that line, splits the capture
    on whitespace and passes the tokens to ``file_operation.segmented``.

    Args:
        soup: Parsed movie detail page (presumably a BeautifulSoup object).

    Returns:
        Whatever ``file_operation.segmented`` builds from the tokens
        (presumably one joined string -- confirm against that helper).

    Raises:
        AttributeError: If the page has no ``<div id="info">``.
        IndexError: If the info panel contains no ``制片国家/地区:`` label.
    """
    info_panel = soup.find('div', id='info')
    # "." does not match a newline, so the capture ends with the label's line.
    matches = re.findall(r'制片国家/地区:(.*)', info_panel.text)
    tokens = matches[0].split()
    return file_operation.segmented(tokens)
需要将爬取的信息放入MySQL数据库的话请看:https://blog.csdn.net/az9996/article/details/85094917