右键选择查看网页源代码,找到相应的job列表位置,但是结果是崩溃的,这是什么鬼东西?即使能爬下来,我们也看不懂,于是我换了一种思路。
import requests
import re
import pymongo
# HTTP headers sent with every request; the User-Agent string identifies the
# client as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/60.0.3112.78 Safari/537.36"
    ),
}

# MongoDB connection settings: host, database name, collection name.
MongoUrl = "localhost"
MongoDB = "Shixiceng"
MongoTable = "shixiceng"
def Gethtml(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request itself failed).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort crawl: report the failure and let the caller move on.
        print(e, "1")
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_html(html):
    """Extract job-detail links from a listing page.

    Args:
        html: Text of a listing page, or ``None``/empty when the download
            failed.

    Returns:
        A list of absolute URLs, one per regex match, each formed by
        prefixing the matched text with the site root. Returns an empty
        list when ``html`` is ``None`` or empty.
    """
    # A failed download yields None; there is nothing to parse in that case.
    if not html:
        return []
    # NOTE(review): this pattern looks like it lost its surrounding HTML
    # markup (as written it only matches empty strings) - confirm against
    # the live page and restore the intended capture group.
    pattern = r'.*?'
    base_url = "http://www.shixiseng.com"
    return [base_url + path for path in re.findall(pattern, html, re.S)]
def Get_jobInfo(url, timeout=10):
    """Fetch a job-detail page and pull out its name, city and limit fields.

    Args:
        url: Address of the job-detail page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        A dict with the keys ``job_name``, ``job_city`` and ``job_limit``,
        or ``None`` when the page could not be fetched (request error or
        non-200 status) or one of the fields had no match.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return None
        page = response.text
        # NOTE(review): the three patterns below are identical and look like
        # they lost their surrounding HTML markup - confirm against the live
        # page and restore the field-specific expressions.
        reg = r'(.*?)'
        job_name = re.findall(reg, page)[0]
        reg = r'(.*?)'
        job_city = re.findall(reg, page)[0]
        reg = r'(.*?)'
        job_limit = re.findall(reg, page)[0]
        return {
            'job_name': job_name,
            'job_city': job_city,
            'job_limit': job_limit,
        }
    except (requests.RequestException, IndexError) as e:
        # IndexError means a field pattern matched nothing on this page.
        print(e, "2")
        return None
def Save_Mongo(result):
    """Store one scraped job record in the configured MongoDB collection.

    Args:
        result: Dict describing a job, or ``None`` when scraping failed; a
            ``None`` record is skipped instead of being sent to the driver.

    Returns:
        None. Prints a confirmation message once the document is stored.
    """
    if result is None:
        return
    # The context manager closes the client so that each call does not leave
    # an open connection pool behind.
    with pymongo.MongoClient(MongoUrl, connect=False) as client:
        collection = client[MongoDB][MongoTable]
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        if collection.insert_one(result).inserted_id is not None:
            print("数据存储成功")
def main():
    """Crawl listing pages 1-19 of the Python internship search and store each job.

    For every listing page the job links are extracted, each detail page is
    scraped, and the resulting record is saved to MongoDB. Failures are
    reported and skipped so that one bad page or job does not stop the run.
    """
    for page in range(1, 20):
        try:
            url = 'http://www.shixiseng.com/interns?k=Python&t=zj&p={}'.format(page)
            html = Gethtml(url)
            if html is None:
                # Listing page could not be downloaded; try the next one.
                continue
            for job_link in parse_html(html):
                info = Get_jobInfo(job_link)
                if info is None:
                    # Skip only this job; previously a failed detail page
                    # raised in Save_Mongo and dropped the rest of the page.
                    continue
                Save_Mongo(info)
        except Exception as e:
            # Top-level boundary: report and continue with the next page.
            print(e, "3")


if __name__ == '__main__':
    main()
最后,由于时间比较紧促,还有一些功能没有写进去,希望你们原谅。另外,如果有哪位大神知道如何解决页面的字符问题,可以给我留言,谢谢。