Python Lab Report 2

Choose a stock code whose last digits match your student ID, write a crawler program, and analyze how the Baidu search results for it change over time. Write the program and the analysis in the experiment results section.

import bs4
import re
import requests
import json
headers = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    'Accept-Encoding':'gzip, deflate',
    'Accept-Language':'zh-Hans-CN, zh-Hans; q=0.5',
    'Connection':'Keep-Alive',
    'Host':'www.baidu.com',  # must match the host actually requested
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'
}

'''Download one page of Baidu search results for the given stock code with requests'''
def baiduText(code, pn):
    # Baidu's pn parameter is the offset of the first result: page pn -> (pn - 1) * 10
    url = 'http://www.baidu.com/s?wd=' + code + '&pn=' + str((pn - 1) * 10)
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'  # linux utf-8
        return r.text
    except requests.RequestException:  # network error, timeout or bad status code
        return "error"

'''Extract the total result count ("百度为您找到相关结果约...个") from the page with bs4'''
def parseSearch(text):
    soup = bs4.BeautifulSoup(text, "html.parser")
    tags = soup.find_all('span')
    for tag in tags:
        cont = tag.string
        if cont is not None and cont.startswith('百度为您找到相关结果约'):
            # keep only the digits of the result count
            cont = re.sub(r'[^0-9]', '', cont)
            writefile('搜索结果为:' + cont)
            print('搜索结果为:', cont)
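
A quick offline check of parseSearch, using a hand-written snippet instead of a real result page (the span text mimics the counter Baidu shows at the top of its results; the number is made up):

# sample = '<html><span>百度为您找到相关结果约1,230,000个</span></html>'
# parseSearch(sample)  # prints: 搜索结果为: 1230000  and appends the same line to txt.txt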

'''Extract each result's title and link from the page with bs4'''
def parseHtml(text, json_data):
    soup = bs4.BeautifulSoup(text, "html.parser")
    tags = soup.find_all('div', class_='result c-container ')
    rule = re.compile(r'href="(.*?)"')
    for tag in tags:
        if tag.a is None:
            continue
        name = str(tag.a.text)
        match = rule.search(str(tag.a))
        if match is not None:
            data = match.group(1)  # the result's (redirect) link
            writefile(name + '\n' + data)
            json_data[name] = data

'''Append one line of text to txt.txt'''
def writefile(data):
    # the with statement closes the file automatically when the block ends
    with open('txt.txt', 'a+', encoding="utf-8") as fp:
        fp.write(data)
        fp.write('\n')

'''Dump the collected title -> link mapping to txt.json'''
def writejson(json_data):
    json_data = json.dumps(json_data, indent=4, ensure_ascii=False)
    # overwrite instead of appending so the file stays valid JSON across runs
    with open('txt.json', 'w', encoding="utf-8") as fp:
        fp.write(json_data)

'''main: crawl Baidu result pages 1-99 for stock code 300014'''
def main():
    json_data = {}
    for pn in range(1, 100):        # pages 1..99, the same range as the original loop
        text = baiduText('300014', pn)
        if text == "error":         # skip pages that failed to download
            continue
        if pn == 1:
            parseSearch(text)
        writefile("第" + str(pn) + "页的搜索结果为:")
        parseHtml(text, json_data)
    print("爬取结束,请看txt.txt")
    writefile("爬取结束")
    writejson(json_data)
main()
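
compare() below expects two snapshots named txt1.json and txt2.json. The report does not show how they are produced; a minimal sketch, assuming the crawl is simply run twice at different times and the fresh txt.json is renamed after each run (snapshot is a hypothetical helper, not part of the original program):

import os
# hypothetical helper: keep the fresh txt.json under a snapshot name so a later crawl
# does not overwrite it
def snapshot(dst):
    os.replace('txt.json', dst)
# snapshot('txt1.json')   # after the first crawl
# snapshot('txt2.json')   # after the second crawl, some time later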
'''Compare two crawl snapshots and record the results that appear in only one of them'''
def compare():
    with open('txt1.json', 'r', encoding="utf-8") as fp:
        data1 = json.load(fp)
    with open('txt2.json', 'r', encoding="utf-8") as fp:
        data2 = json.load(fp)
    # temporary dict for results that appear in only one of the two snapshots
    diff = {}
    for i in data1:
        if i not in data2:
            diff[i] = data1[i]
    for i in data2:
        if i not in data1:
            diff[i] = data2[i]
    # write the differences out as JSON (overwrite so the file stays valid JSON)
    json_data = json.dumps(diff, indent=4, ensure_ascii=False)
    with open('txt.json', 'w', encoding="utf-8") as fp:
        fp.write(json_data)
    print("over")
compare()
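
compare() only reports titles that appear in one snapshot but not in the other. A possible extension, not part of the original program, that also lists titles present in both crawls whose link changed (same txt1.json/txt2.json inputs; json is already imported at the top of the script):

def compare_changed(path1='txt1.json', path2='txt2.json'):
    with open(path1, 'r', encoding='utf-8') as fp:
        d1 = json.load(fp)
    with open(path2, 'r', encoding='utf-8') as fp:
        d2 = json.load(fp)
    # titles present in both snapshots whose link differs between the two crawls
    return {k: (d1[k], d2[k]) for k in d1.keys() & d2.keys() if d1[k] != d2[k]}
# print(compare_changed())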
