import requests
import re
from lxml import etree
import time
import fake_useragent
def str_qukong_quhuanhang(con):
    """Strip every whitespace character (spaces, tabs, newlines) from *con*.

    *con* may be a string or any iterable of strings; the pieces are
    concatenated first and the cleaned single string is returned.
    """
    flattened = ''.join(con)
    return ''.join(flattened.split())
# Fetch the response for each listing page.
# Current page counter for the paginated POST requests; advanced in post_res().
page_number = 1
ua=fake_useragent.UserAgent()
# NOTE(review): `data_randomize` is not part of the documented fake_useragent
# API (the usual accessor is `ua.random`) — confirm against the installed
# version. The value is only printed; the headers below hard-code a
# User-Agent string, so `ua` is otherwise unused.
ua=ua.data_randomize
print(ua)
def post_res():
    """POST the listing-search form for the current page and return the
    parsed HTML tree.

    Uses and advances the module-level ``page_number`` counter. After page
    6755 a completion message is printed and the process sleeps
    indefinitely (crude end-of-crawl sentinel).
    """
    global page_number
    list_url = 'http://scjgj.beijing.gov.cn/qyxy/newChange/newChangeAction!gsmd_list.dhtml?clear=true&flag_num=2'
    form = {
        'entNameAddress': '',
        'issue_date': '',
        'vchr_bmmc': '',
        'before_year_3': '2016 - 04 - 15',
        'issue_org_code': '',
        'deliv_date': '',
        'qy_ent_name': '',
        'qy_reg_no': '',
        'blsfxx': '',
        'num': '',
        'party_kind': '',
        'SelectPageSize': 50,
        'EntryPageNo': 1,
        'pageNo': str(page_number),
        'pageSize': 50,
        'clear': 'true',
    }
    browser_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'UM_distinctid=16a1f99b10cac-00b59cac3ead8a-1a201708-1fa400-16a1f99b10dcbc; JSESSIONID=PJ82c05BknCGC3Qv8VFKGRJ4b22Vmvq1hJb4JhfmW7pvJjp5VDlN!-377951661; CNZZDATA1257386840=1902084243-1555303935-http%253A%252F%252Fscjgj.beijing.gov.cn%252F%7C1555314735',
        'Host': 'scjgj.beijing.gov.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    response = requests.post(list_url, headers=browser_headers, data=form)
    parsed = etree.HTML(response.text)
    if page_number < 6755:
        page_number += 1
    else:
        print('爬区完毕')
        time.sleep(99999)
    return parsed
# GET the response for a single record URL.
def get_res(url):
    """Fetch *url* with the site's browser-mimicking headers and return
    the parsed lxml HTML tree."""
    browser_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'UM_distinctid=16a1f99b10cac-00b59cac3ead8a-1a201708-1fa400-16a1f99b10dcbc; JSESSIONID=PJ82c05BknCGC3Qv8VFKGRJ4b22Vmvq1hJb4JhfmW7pvJjp5VDlN!-377951661; CNZZDATA1257386840=1902084243-1555303935-http%253A%252F%252Fscjgj.beijing.gov.cn%252F%7C1555314735',
        'Host': 'scjgj.beijing.gov.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    response = requests.get(url, headers=browser_headers)
    return etree.HTML(response.text)
# Extract the URL / summary fields from each listing-page row; returns the
# list of (up to 50) detail-page URLs.
num1 = 1
def tiqu_page_in_url(res):
    """Parse one listing page.

    res: lxml HTML tree of a listing page (one <table> per record under
    the #creditForm element).

    Appends one CSV row per record to ../ls/信息列表.csv (numbering with
    the module-level ``num1`` counter) and returns the list of
    detail-page URLs found.
    """
    global num1
    tables = res.xpath('//*[@id="creditForm"]/table')
    print(len(tables))
    urllist = []
    for tbl in tables:
        url_q = 'http://scjgj.beijing.gov.cn/'
        # The onclick attribute holds a JS call whose argument is the
        # relative path — pull out everything after the first '/'.
        url_h = ''.join(re.findall("/(.*?)'", tbl.xpath('./tr[1]/td[2]/span/@onclick')[0]))
        url = url_q + url_h
        urllist.append(url)
        gs_m = str_qukong_quhuanhang(tbl.xpath('./tr[1]/td[2]/span/font/text()')[0])
        page_1 = str_qukong_quhuanhang(tbl.xpath('./tr[2]/td[2]/text()')[0])
        page_2 = str_qukong_quhuanhang(tbl.xpath('./tr[2]/td[3]/text()')[0])
        page_3 = str_qukong_quhuanhang(tbl.xpath('./tr[3]/td[2]/text()')[0])
        # BUG FIX: the original selected the <td> *element* (no /text()),
        # which made str_qukong_quhuanhang's ''.join(...) raise TypeError;
        # select the text node like the other fields.
        page_4 = str_qukong_quhuanhang(tbl.xpath('./tr[3]/td[3]/text()')[0])
        # Explicit utf-8 so the Chinese fields round-trip regardless of the
        # platform's default locale encoding.
        with open('../ls/信息列表.csv', 'a', encoding='utf-8') as f:
            co = ','.join([str(num1), gs_m, url, page_1, page_2, page_3, page_4])
            f.write(co + '\n')
        num1 += 1
    print(urllist)
    return urllist
# Extract the summary fields and detail URL from each record's page; yields
# the detail-page URL for every matching table.
num2 = 1
def tiqu_dazhi_in_url(res):
    """Parse a record's summary page.

    res: lxml HTML tree of a record page containing tables with
    id="tableID".

    Appends one CSV row per table to ../ls/大致信息.csv (numbered by the
    module-level ``num2`` counter) and yields the detail-page URL
    extracted from each table. Note this is a generator — callers must
    iterate it for any work to happen.
    """
    global num2
    # BUG FIX: the original predicate was [id="tableID"] (missing @), which
    # tests for a child *element* named "id" rather than the id attribute
    # and therefore matched nothing.
    tables = res.xpath('//*[@id="tableID"]')
    for tbl in tables:
        t1 = str_qukong_quhuanhang(tbl.xpath('./tr[1]/text()')[0])
        t2 = str_qukong_quhuanhang(tbl.xpath('./tr[2]/text()')[0])
        t3 = str_qukong_quhuanhang(tbl.xpath('./tr[3]/text()')[0])
        t4 = str_qukong_quhuanhang(tbl.xpath('./tr[4]/text()')[0])
        t5 = str_qukong_quhuanhang(tbl.xpath('./tr[5]/text()')[0])
        # BUG FIX: '..p/@onclick' is not valid XPath (raises XPathEvalError).
        # '../p/@onclick' — the <p> under the table's parent — appears to be
        # what was intended; TODO confirm against the live page markup.
        url_h = ''.join(re.findall("/(.*?)'", tbl.xpath('../p/@onclick')[0]))
        url = 'http://scjgj.beijing.gov.cn/' + url_h
        # Explicit utf-8 so the Chinese fields are written consistently.
        with open('../ls/大致信息.csv', 'a', encoding='utf-8') as f:
            co = ','.join([str(num2), t1, t2, t3, t4, t5, url])
            f.write(co + '\n')
        num2 += 1
        yield url
# Extract the data from each detail page.
def tiqu_xiangxi_(res):
    """Placeholder: parse a record's detail page (not yet implemented)."""
    pass
if __name__ == '__main__':
    # Crawl loop: each listing page -> every record's summary page ->
    # every record's detail page.
    while True:
        page_res = post_res()
        page_url_ls = tiqu_page_in_url(page_res)
        for page_one in page_url_ls:
            dazhi_res = get_res(page_one)
            # BUG FIX: tiqu_dazhi_in_url is a generator of detail URLs; the
            # original passed the generator object itself to get_res (which
            # would have requested a nonsense URL and never consumed the
            # generator). Iterate it and fetch each detail URL instead.
            for detail_url in tiqu_dazhi_in_url(dazhi_res):
                xiangxi_res = get_res(detail_url)
                tiqu_xiangxi_(xiangxi_res)