[python][project][crawler] Scraping information from Mtime (时光网)


This post summarizes how to scrape Mtime (时光网); it is intended only as a learning exercise in web crawling:


1. Scraping hot news and news images from the homepage:

url = "http://www.mtime.com/"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
#获取热门资讯
news_list = soup.find_all('div', attrs={'class': 'newsitem'})
for news in news_list:
    title=news.find('a')['title']
    content=news.find('a')['href']
     #将热点资讯放在字典中,key值为资讯标题,value为资讯链接
    news_dic[title]=content
#获取热点图片
hotpicture_list = soup.find_all('div', attrs={'class': 'over-a'})
for hotpicture in hotpicture_list:
    picture_url = re.findall("(?<=[(])[^()]+\.[^()]+(?=[)])", hotpicture['style'])[0].replace(' ', '')
    urllib.urlretrieve(picture_url, picture_path)
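
urllib.urlretrieve only exists in Python 2; under Python 3 it moved to urllib.request.urlretrieve. A minimal sketch of an equivalent helper built on requests (the name download_image and the way the file name is derived from the URL are my own assumptions, not part of the original script):

import os
import requests

def download_image(picture_url, save_dir='.'):
    # Download one image and keep the file name from the URL (falls back to 'image.jpg')
    resp = requests.get(picture_url, timeout=10)
    resp.raise_for_status()
    filename = os.path.basename(picture_url.split('?')[0]) or 'image.jpg'
    picture_path = os.path.join(save_dir, filename)
    with open(picture_path, 'wb') as f:
        f.write(resp.content)
    return picture_path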

2. Scraping movie information from search results

# quote URL-encodes the search string (urllib.quote in Python 2, urllib.parse.quote in Python 3)
searchcontent = quote(searchcontent)
url_search = 'http://service.channel.mtime.com/Search.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Channel.Services&Ajax_CallBackMethod=GetSearchResult&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fsearch.mtime.com%2Fsearch%2F'+quote('/?q='+searchcontent+'&t=1&i=0&c=791')+'&Ajax_CallBackArgument0='+searchcontent+'&Ajax_CallBackArgument1=1&Ajax_CallBackArgument2=791&Ajax_CallBackArgument3=0&Ajax_CallBackArgument4=1'
moviesearch_result = requests.get(url_search).content
movie_dic = {}
# Each movie appears in the response as a JSON object starting with "movieId"
movie_list = re.findall(r'{"movieId":(.*?)}', moviesearch_result)
for movie in movie_list:
    movie_title = re.findall(r'"movieTitle":"(.*?)",', movie)[0]
    movie_url = re.findall(r'"movieUrl":"(.*?)",', movie)[0]
    # Store the results in a dict: key is the movie title, value is the movie link
    movie_dic[movie_title] = movie_url
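
The Search.api endpoint returns a JavaScript callback wrapping a JSON payload, which is why the snippet above falls back on field-by-field regexes. As an alternative sketch, the whole movie array can be pulled out once and handed to the json module (the '"movies":' key name is an assumption about the payload layout and may need adjusting against the real response):

import json
import re

match = re.search(r'"movies":(\[.*?\])', moviesearch_result)
if match:
    movies = json.loads(match.group(1))  # list of dicts, one per movie
    movie_dic = {m.get('movieTitle'): m.get('movieUrl') for m in movies}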


3. Scraping filmmaker information from search results

searchcontent = quote(searchcontent)
url_search = 'http://service.channel.mtime.com/Search.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Channel.Services&Ajax_CallBackMethod=GetSearchResult&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fsearch.mtime.com%2Fsearch%2F'+quote('?q='+searchcontent+'&t=3&i=0&c=791')+'&Ajax_CallBackArgument0='+searchcontent+'&Ajax_CallBackArgument1=3&Ajax_CallBackArgument2=791&Ajax_CallBackArgument3=0&Ajax_CallBackArgument4=1'
moviersearch_result = requests.get(url_search).content
movirer_list = re.findall(r'{"personId":(.*?)}]', moviersearch_result)[0]
# Person photo (cover image); movierpicture_path is the local save path
movierpicture_url = re.findall(r'"cover":"(.*?)",', movirer_list)[0]
urllib.urlretrieve(movierpicture_url, movierpicture_path)
# Person name
moviertitle = re.findall(r'"personTitle":"(.*?)",', movirer_list)[0]
# Filmography / profession
movierwork = re.findall(r'"personFilmography":"(.*?)",', movirer_list)[0]
# Birthday
movierbirth = re.findall(r'"birth":"(.*?)",', movirer_list)[0]
# Popularity ("love" score)
movierlove = re.findall(r'"love":(.*?),', movirer_list)[0]
# Representative works, joined into one string wrapped in 《》
moviermovielist = re.findall(r'"title":"(.*?)",', movirer_list)
movielist = ""
for m in moviermovielist:
    movielist = movielist + "《" + m + "》"
# Link to the full profile page
url_movier = re.findall(r'"personUrl":"(.*?)",', movirer_list)[0]
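
For symmetry with the dictionaries used in steps 1 and 2, the extracted fields can be collected into a single record (a small sketch; the key names are my own choice):

movier_info = {
    'name': moviertitle,
    'profession': movierwork,
    'birthday': movierbirth,
    'popularity': movierlove,
    'works': movielist,
    'profile_url': url_movier,
    'photo_path': movierpicture_path,
}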

4. Scraping all of a user's reviews by user ID

review_dict = dict()
point_dict = dict()
review_No = 1
point_No = 1
page_num = 1
# userId is the numeric Mtime member id that appears in the user's profile URL
url_user='http://sandbox.my.mtime.com/Service/callback.mc?Ajax_CallBack=true&Ajax_CallBackType=Mtime.MemberCenter.Pages.CallbackService&Ajax_CallBackMethod=RemoteLoad&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fmy.mtime.com%2F'+ str(userId) + '%2F&Ajax_CallBackArgument0=t&Ajax_CallBackArgument1=' + str(userId) + '%2F%3F%242'
while True:
    cont = requests.get(url_user).content
    cont_list = re.findall(r'content:"\s+(.+?)', cont)[0]
    txt = re.sub(r'\\', '', cont_list)
    soup = BeautifulSoup(txt, 'html.parser')
    # User info (parsed from the first page only)
    if page_num == 1:
        member_info_name = soup.find('div', attrs={'class': 't_memberinfo'}).find('a')['title']
        member_info = member_info_name + u'说'  # the page prefixes each review with "<username>说" ("<username> says")
    # User reviews and ratings
    Content_list = soup.find_all('div', attrs={'class': 't_module'})
    for content in Content_list:
        review = content.find('dt', attrs={'class': 'normal'})
        point = content.find('strong', attrs={'class': 'c_green fl'})
        if review:
            ## print 'This is a review'
            try:
                review_movie = content.find('dd', attrs={'class': 'clearfix mt9 tl_link'}).get_text()
            except AttributeError:
                pass
            review_content = review.get_text()
            review_content = re.sub(member_info, '', review_content)
            review_time = content.find('span', attrs={'class': 'mt3 fl'}).find('a').get_text()
            # Store the review in a dict: key is a running number, value is [reviewed movie, review text, review time]
            review_detail = [review_movie, review_content, review_time]
            review_dict[review_No] = review_detail
            review_No = review_No + 1
        elif point:
            # print 'This is a point'
            if content.find('div', attrs={'class': 'clearfix mt9 px14 tl_link lh16'}):
                point_movie = content.find('div', attrs={'class': 'clearfix mt9 px14 tl_link lh16'}).get_text()
                point_content = point.get_text()
                point_time = content.find('span', attrs={'class': 'mt3 fl'}).find('a').get_text()
                # Store the rating in a dict: key is a running number, value is [rated movie, rating value, rating time]
                point_detail = [point_movie, point_content, point_time]
                point_dict[point_No] = point_detail
                point_No = point_No + 1
    page_list = soup.find('div', attrs={'class': 'my_page'})
    # Get the link to the next page
    next_page = page_list.find('a', attrs={'class': 'ml10 next'})
    if next_page:
        if page_num == 1:
            max_page = page_list.find_all('a', attrs={'class': 'num'})
        page_num = page_num + 1
        url_user = 'http://sandbox.my.mtime.com/Service/callback.mc?Ajax_CallBack=true&Ajax_CallBackType=Mtime.MemberCenter.Pages.CallbackService&Ajax_CallBackMethod=RemoteLoad&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fmy.mtime.com%2F' + str(userId) + '%2F%3Ffilter%3D0%26pageIndex%3D' + str(page_num) + '&Ajax_CallBackArgument0=t&Ajax_CallBackArgument1=' + str(userId) + '%2F%3Ffilter%3D0%26pageIndex%3D' + str(page_num)

    else:
        break
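
The paged URL above is assembled by hand-concatenating pre-escaped fragments, which is easy to get wrong. A small sketch of a builder that produces the same Ajax_RequestUrl via urllib quoting (the helper name build_user_page_url is my own; the parameter layout mirrors the paged URL used above):

from urllib import quote  # Python 3: from urllib.parse import quote

def build_user_page_url(userId, page_num):
    # Build the callback URL for one page of a user's timeline
    base = 'http://sandbox.my.mtime.com/Service/callback.mc'
    target = 'http://my.mtime.com/%s/?filter=0&pageIndex=%s' % (userId, page_num)
    return (base
            + '?Ajax_CallBack=true'
            + '&Ajax_CallBackType=Mtime.MemberCenter.Pages.CallbackService'
            + '&Ajax_CallBackMethod=RemoteLoad'
            + '&Ajax_CrossDomain=1'
            + '&Ajax_RequestUrl=' + quote(target, safe='')
            + '&Ajax_CallBackArgument0=t'
            + '&Ajax_CallBackArgument1=' + quote('%s/?filter=0&pageIndex=%s' % (userId, page_num), safe=''))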


5. Simulating login and posting a review

# Build the form data for the login POST request
login_postdata = {
    'loginEmailText': 'username',  # the username is sent as-is; no hashing needed
    'loginPasswordText': '',
    'inputVcode': '',
    'isvcode': 'true',
    'isAutoSign': 'true'
}

# MD5-hash the password before sending
login_postdata['loginPasswordText'] = self.md5(password)
url_login='https://passport.mtime.com/member/signinLogin'
s = requests.Session()
login_result=json.loads(s.post(url_login, data=login_postdata).content)
if login_result['result']['code'] == 91880012:
    self.label_inform.setText(u'用户名无效')   # invalid username
elif login_result['result']['code'] == 91880013:
    self.label_inform.setText(u'密码错误')     # wrong password
elif login_result['result']['code'] == 0:
    self.label_inform.setText(u'登录成功!')    # logged in successfully


# After logging in, post a micro-review; content is the text to post
url_send='http://service.mtime.com/Service/Twitter.msi?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Service.Pages.TwitterService&Ajax_CallBackMethod=PostTweetCrossDomainByFlash&Ajax_CrossDomain=1&Ajax_RequestUrl=http://my.mtime.com/&Ajax_CallBackArgument0='+content

s.get(url_send)
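
If content contains Chinese or other non-ASCII characters, appending it to the URL as-is may produce an invalid request; a minimal sketch of URL-encoding it first (assuming content is a UTF-8 encoded str; under Python 3 use urllib.parse.quote):

from urllib import quote  # Python 3: from urllib.parse import quote

url_send = ('http://service.mtime.com/Service/Twitter.msi?Ajax_CallBack=true'
            '&Ajax_CallBackType=Mtime.Service.Pages.TwitterService'
            '&Ajax_CallBackMethod=PostTweetCrossDomainByFlash&Ajax_CrossDomain=1'
            '&Ajax_RequestUrl=http://my.mtime.com/&Ajax_CallBackArgument0=' + quote(content))
s.get(url_send)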

# Password hashing helper (requires "import hashlib")
def md5(self, text):
    m = hashlib.md5()
    m.update(text)  # under Python 3, pass bytes here, e.g. text.encode('utf-8')
    return m.hexdigest()





