# 2020-05-20

import requests,re#requests模块抓取网页,re提取字符
from pyquery import PyQuery as pq#pyquery分析HTML
import pandas as pd
from sqlalchemy import create_engine#pandas配合sqlalchemy将形成的DataFrame输入数据库,以便后期可视化分析


# HTTP headers sent with every request to the forum.
#
# Only real header fields belong here. The HTTP/2 pseudo-header names that
# browser dev tools display (authority, method, path, scheme) are produced by
# the HTTP client itself and must not be sent as ordinary headers.
#
# NOTE(review): the 'cookie' value is a logged-in browser session pasted in
# verbatim. It will expire, and session credentials should not live in source
# control — consider loading it from the environment instead.
BHCheaders = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'br' is deliberately not advertised: requests/urllib3 can only decode
    # Brotli responses when an optional brotli package is installed, and an
    # undecoded body would make the HTML unparseable.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': 'PoU4_2132_saltkey=V42zh2qe; PoU4_2132_lastvisit=1588779238; UM_distinctid=171ead63b661ba-0c2fc04eaf8fbf-38710758-1fa400-171ead63b67430; CNZZDATA1273976237=1387143177-1588780707-%7C1588780707; PoU4_2132_ulastactivity=60340GMBoqWqmtWzkffhOcGrbnHg0VHaw6HO%2BRzCTRHpLVW0l4Bc; PoU4_2132_auth=0e9dfectx9uUJmvyOKNBdKxgVgJgz%2BiGuAtiqBfY%2BEwEvzU0ZQtoH%2BFhqsV9jPjjhmU17xyujaDYPDxIFK4b9OYZPhM; PoU4_2132_lastcheckfeed=358046%7C1588782875; PoU4_2132_sid=aiKiUN; PoU4_2132_lip=114.84.243.166%2C1588782875; PoU4_2132_nofavfid=1; PoU4_2132_noticeTitle=1; PoU4_2132_atarget=1; PoU4_2132_visitedfid=90; PoU4_2132_st_p=358046%7C1588784702%7Cd73a2b57d834792b86a11a1cd8773d3f; PoU4_2132_viewid=tid_11230; PoU4_2132_lastact=1588784890%09forum.php%09forumdisplay; PoU4_2132_st_t=358046%7C1588784890%7Cb59704da1fa4d9a40434310a8b897d87; PoU4_2132_forum_lastvisit=D_90_1588784890',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
}
# Form fields posted when logging in. Replace the 'Username' / 'password'
# placeholders with your own account credentials.
#
# NOTE(review): 'formhash' looks like a per-session token copied from a
# browser — confirm the server still accepts a fixed value.
Data = {
    'formhash': '1fb1a8d9',
    # Plain URL on purpose: a dict passed as `data=` is form-encoded when the
    # request is sent, so a pre-encoded value would be encoded twice
    # ('%3A' would arrive as '%253A').
    'referer': 'https://www.bhc520.me/portal.php',
    'loginfield': 'username',
    'username': 'Username',
    'password': 'password',
    'questionid': '0',
    'answer': '',
}
# Seconds to wait on any single HTTP request before giving up; without a
# timeout a stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30

# Patterns that pull the contact fields out of a post body. Compiled once
# here rather than once per post.
ADDRESS_PATTERN = re.compile(r'.*联系地址:(.*?)联系方式:.*')
TEL_PATTERN = re.compile(r'.*联系方式:(.*?)联系我时请说明来自百花丛.*')


def _first_group(pattern, text):
    """Return the first captured group of *pattern* in *text*, or "None".

    The string "None" (not the None object) marks a missing field, matching
    the placeholder value stored in the output table.
    """
    match = pattern.search(text)
    return match.group(1) if match else "None"


Sess = requests.session()  # a Session carries cookies from one request to the next
# Placeholder connection URL: substitute your own user, password, host, port
# and database name.
engine = create_engine('mysql+pymysql://Username:Userpassword#@Ipaddress:Port/Databasename')
# Log in.
res = Sess.post('https://www.bhc520.me/portal.php', headers=BHCheaders, data=Data,
                timeout=REQUEST_TIMEOUT)
ShangHaiJiaLi = pd.DataFrame()  # stays empty if the login request is rejected
# NOTE(review): a 200 status only shows the request succeeded, not that the
# login itself was accepted.
if res.status_code == 200:
    # Rows are collected in a list and turned into a DataFrame once at the
    # end; concatenating DataFrames inside the loop copies all earlier rows
    # on every iteration.
    PersonalRows = []
    for PageCount in range(1, 23):  # 23 is the last listing page number + 1
        HeatUrl = 'https://www.bhc520.me/forum.php?mod=forumdisplay&fid=2&filter=&orderby=dateline&&page=%s' % PageCount
        #          https://www.bhc520.me/forum.php?mod=forumdisplay&fid=2&filter=&orderby=dateline&&page=  # Guangzhou
        #          https://www.bhc520.me/forum.php?mod=forumdisplay&fid=90&page=  # Shanghai
        # NOTE(review): the URL in use is the one labelled Guangzhou above,
        # while the results are stored under the name "ShangHaiJiaLi" —
        # confirm which board is intended.
        try:
            Html = Sess.get(HeatUrl, headers=BHCheaders, timeout=REQUEST_TIMEOUT)
            # Parse the listing page once; every item lookup below reuses it.
            doc = pq(Html.text)
        except (requests.RequestException, ValueError) as e:
            # One bad page should not throw away everything scraped so far.
            print(e)
            continue
        # NOTE(review): only list positions 1-19 are examined — confirm that
        # a listing page never holds more entries than that.
        for i in range(1, 20):
            items = doc('#waterfall > li:nth-child(%s) > div.deanddimg > a' % i)
            PersonalHref = items.attr('href')
            if not PersonalHref:
                # No entry at this position; skipping avoids requesting and
                # storing a bogus ".../None" link.
                continue
            imgs = doc('#waterfall > li:nth-child(%s) > div.deanddimg > a > img' % i)
            PersonalTitle = items.attr('title')
            PersonalUrlLink = 'https://www.bhc520.me/' + str(PersonalHref)
            PersonalImagsUrl = 'https://www.bhc520.me/' + str(imgs.attr('src'))
            try:
                PersonalUrlLink_Content = Sess.get(PersonalUrlLink, headers=BHCheaders,
                                                   timeout=REQUEST_TIMEOUT)
                # Extract the post body text once and run both patterns on it.
                PersonalBody = pq(PersonalUrlLink_Content.text).find('.t_f').text()
            except (requests.RequestException, ValueError) as e:
                print(e)
                continue
            PersonalRows.append({
                'Title': PersonalTitle,
                'UrlLink': PersonalUrlLink,
                'ImagsUrl': PersonalImagsUrl,
                # Only the first match is kept, so a post that matches more
                # than once still yields exactly one row.
                'PersonalText': _first_group(ADDRESS_PATTERN, PersonalBody),
                'PersonalTel': _first_group(TEL_PATTERN, PersonalBody),
            })
    ShangHaiJiaLi = pd.DataFrame(
        PersonalRows,
        columns=['Title', 'UrlLink', 'ImagsUrl', 'PersonalText', 'PersonalTel'],
    )
    # Replace any previous table contents with this run's results.
    ShangHaiJiaLi.to_sql("ShangHaiJiaLi", engine, index=False, if_exists='replace')
else:
    print(res.status_code)

# 你可能感兴趣的:(python)