import requests,re
from pyquery import PyQuery as pq
import pandas as pd
from sqlalchemy import create_engine
# Browser-like request headers sent with every HTTP call this script makes.
#
# The entries 'authority', 'method', 'path' and 'scheme' that used to be here
# look like HTTP/2 pseudo-headers copied from a browser's network panel. They
# are not real header fields, and the fixed 'path' value pointed at a different
# forum page than the ones actually requested, so they have been dropped.
#
# 'br' is no longer advertised in accept-encoding: requests can only decode
# Brotli bodies when an optional brotli package is installed, and without it
# the response text would be undecodable bytes.
#
# NOTE(review): the cookie below is a hard-coded, logged-in session captured
# from a browser. It is a secret and will expire; it should be loaded from the
# environment or a config file instead of living in source control. Presumably
# an explicit cookie header also takes precedence over whatever cookies the
# requests session collects -- confirm before relying on the login POST.
BHCheaders = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': 'PoU4_2132_saltkey=V42zh2qe; PoU4_2132_lastvisit=1588779238; UM_distinctid=171ead63b661ba-0c2fc04eaf8fbf-38710758-1fa400-171ead63b67430; CNZZDATA1273976237=1387143177-1588780707-%7C1588780707; PoU4_2132_ulastactivity=60340GMBoqWqmtWzkffhOcGrbnHg0VHaw6HO%2BRzCTRHpLVW0l4Bc; PoU4_2132_auth=0e9dfectx9uUJmvyOKNBdKxgVgJgz%2BiGuAtiqBfY%2BEwEvzU0ZQtoH%2BFhqsV9jPjjhmU17xyujaDYPDxIFK4b9OYZPhM; PoU4_2132_lastcheckfeed=358046%7C1588782875; PoU4_2132_sid=aiKiUN; PoU4_2132_lip=114.84.243.166%2C1588782875; PoU4_2132_nofavfid=1; PoU4_2132_noticeTitle=1; PoU4_2132_atarget=1; PoU4_2132_visitedfid=90; PoU4_2132_st_p=358046%7C1588784702%7Cd73a2b57d834792b86a11a1cd8773d3f; PoU4_2132_viewid=tid_11230; PoU4_2132_lastact=1588784890%09forum.php%09forumdisplay; PoU4_2132_st_t=358046%7C1588784890%7Cb59704da1fa4d9a40434310a8b897d87; PoU4_2132_forum_lastvisit=D_90_1588784890',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
}
# Form fields posted as the login request body.
#
# 'referer' holds the plain URL: requests percent-encodes the values of a
# `data=` dict itself, so the previously pre-encoded value
# ('https%3A%2F%2F...') reached the server double-encoded.
#
# NOTE(review): 'username' / 'password' are placeholders and 'formhash' is a
# fixed token; presumably the site issues a fresh formhash per session --
# confirm, and load real credentials from outside the source file.
Data = {
    'formhash': '1fb1a8d9',
    'referer': 'https://www.bhc520.me/portal.php',
    'loginfield': 'username',
    'username': 'Username',
    'password': 'password',
    'questionid': '0',
    'answer': '',
}
# One shared HTTP session used for the login POST and every later GET.
Sess = requests.session()
# NOTE(review): placeholder connection string; the real credentials should be
# supplied from the environment or a config file, not committed to source.
engine = create_engine('mysql+pymysql://Username:Userpassword#@Ipaddress:Port/Databasename')

# Site root that the href/src values taken from the listing are appended to.
_BASE_URL = 'https://www.bhc520.me/'
# Seconds to wait on a single HTTP request, so a stalled server cannot hang
# the whole run.
_REQUEST_TIMEOUT = 30
# Compiled once instead of on every entry. The first captures the text between
# the address label and the contact label; the second captures the text
# between the contact label and the site's trailing notice.
_ADDRESS_RE = re.compile(r'.*联系地址:(.*?)联系方式:.*')
_TEL_RE = re.compile(r'.*联系方式:(.*?)联系我时请说明来自百花丛.*')


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*.

    Returns the string "None" when nothing matches, which is the placeholder
    the stored rows use for a missing field. When several matches exist the
    first one is used, so such entries are kept rather than dropped.
    """
    match = pattern.search(text)
    return match.group(1) if match else "None"


def _scrape_entry(doc, position):
    """Build the row for the listing entry at 1-based *position*.

    *doc* is the parsed listing page. Returns ``None`` when the listing has no
    link at that position, so no request is made for a non-existent entry.
    Otherwise the entry's detail page is fetched and parsed once, and a dict
    with the keys Title, UrlLink, ImagsUrl, PersonalText and PersonalTel is
    returned.

    Raises requests.RequestException if the detail page cannot be fetched.
    """
    link = doc('#waterfall > li:nth-child(%s) > div.deanddimg > a' % position)
    href = link.attr('href')
    if href is None:
        return None

    image = doc('#waterfall > li:nth-child(%s) > div.deanddimg > a > img' % position)
    src = image.attr('src')

    detail_url = _BASE_URL + str(href)
    detail_page = Sess.get(detail_url, headers=BHCheaders, timeout=_REQUEST_TIMEOUT)
    # Parse the detail page once and reuse its text for both patterns.
    post_text = pq(detail_page.text).find('.t_f').text()

    return {
        'Title': link.attr('title'),
        'UrlLink': detail_url,
        # A missing image is stored as NULL rather than a bogus ".../None" URL.
        'ImagsUrl': _BASE_URL + str(src) if src is not None else None,
        'PersonalText': _first_group(_ADDRESS_RE, post_text),
        'PersonalTel': _first_group(_TEL_RE, post_text),
    }


res = Sess.post(
    'https://www.bhc520.me/portal.php',
    headers=BHCheaders,
    data=Data,
    timeout=_REQUEST_TIMEOUT,
)
ShangHaiJiaLi = pd.DataFrame()
if res.status_code == 200:
    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating a one-row frame per entry copies the whole table each time.
    rows = []
    for PageCount in range(1, 23):
        HeatUrl = 'https://www.bhc520.me/forum.php?mod=forumdisplay&fid=2&filter=&orderby=dateline&&page=%s' % PageCount
        try:
            Html = Sess.get(HeatUrl, headers=BHCheaders, timeout=_REQUEST_TIMEOUT)
            # Parse the listing once per page rather than once per entry.
            doc = pq(Html.text)
        except (requests.RequestException, ValueError) as e:
            # A failed listing page is reported and skipped so the pages
            # already scraped are still saved.
            print(e)
            continue
        for i in range(1, 20):
            try:
                row = _scrape_entry(doc, i)
            except (requests.RequestException, ValueError) as e:
                # One unreachable or unparsable entry must not abort the run.
                print(e)
                continue
            if row is not None:
                rows.append(row)
    if rows:
        ShangHaiJiaLi = pd.DataFrame(rows)
        ShangHaiJiaLi.to_sql("ShangHaiJiaLi", engine, index=False, if_exists='replace')
    else:
        # Do not replace an existing table with an empty, column-less frame.
        print('no entries scraped; table "ShangHaiJiaLi" left unchanged')
else:
    print(res.status_code)