Web scraping (advanced): crawling page information and writing it to a JSON file
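Two small crawlers are walked through below, both built on requests, BeautifulSoup and re. The first walks the listing pages under http://usagz.bailitop.com/public/, collects the article links, and pulls a title, date and body out of each article. The second POSTs to an Ajax endpoint on case.bailitop.com to obtain the case-page URLs, extracts a structured record (title, abstract, offer details, student profile and scores) from every case page, and dumps the whole list to CaseRecord.json.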

import requests  # Python HTTP client library, used for writing crawlers and checking server responses
import re
import json
from bs4 import BeautifulSoup
import copy
print('Collecting article links...')
List = []
for page in range(8):
    if page == 0:
        url = 'http://usagz.bailitop.com/public/'
    else:
        url = 'http://usagz.bailitop.com/public/' + str(page + 1) + '.html'
    print('----------- Crawling listing page ' + str(page + 1) + ' ------')
    html = requests.get(url)
    html.raise_for_status()
    html.encoding = 'utf-8'
    try:
        soup = BeautifulSoup(html.text, 'html.parser')
        soup = str(soup)
        # Regex that matches the article links on the listing page
        href = re.compile(r'http://usagz\.bailitop\.com/public/\d*/\d*\.html')
        URLs = re.findall(href, soup)
        flag = 0
        # Skip the duplicated links at the top of every page
        for webUrl in URLs:
            flag = flag + 1
            if flag > 4:
                List.append(webUrl)
                # 15 records per listing page
    except Exception as e:
        print(e)

print(List)
# Record template for one article
data = {'title': '', 'content': '', 'time': ''}
dataList = []

for webSite in List:
    print('\n')
    html = requests.get(webSite)
    html.raise_for_status()
    html.encoding = 'utf-8'
    try:
        soup = BeautifulSoup(html.text, 'html.parser')
        soup = str(soup)
        # Title
        # NOTE: the HTML-tag pieces of the regex patterns in this script were stripped
        # when the post was rendered; '<…>' marks where those lost fragments used to be.
        reg = re.compile('<…>.*<…>(.*)<…>.*<…>', re.S)
        title = re.findall(reg, soup)
        title = title[0]
        if title.count('img'):
            # Strip the tag in front of the title text
            title = title.split('>', 1)
            title = title[1]
            # Strip the tag after the title text
            title = title.split('<', 1)
            title = title[0]
        # Date
        reg = re.compile(r'\d{4}-\d\d-\d\d')
        date = re.findall(reg, soup)
        date = date[0]
        # Body text
        reg = re.compile('<…>(.*)<…>.*<…>', re.S)
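The rest of this first script was cut off when the post was published. Judging from the data template above and from the second script below, the missing tail would pull the body text out with the last regex, copy the three fields into data, collect everything in dataList, and dump the list to a JSON file. A minimal sketch of that tail (the variable name content and the output file name PublicRecord.json are assumptions):

        content = re.findall(reg, soup)[0]
        data['title'] = title
        data['time'] = date
        data['content'] = content
        dataList.append(copy.deepcopy(data))
    except Exception as e:
        print(e)

jsonList = json.dumps(dataList, ensure_ascii=False)
with open("PublicRecord.json", "w", encoding='utf-8') as f:
    f.write(jsonList)
print('Finished writing PublicRecord.json')

The second script below goes a step further: instead of crawling listing pages, it POSTs to an Ajax endpoint to get the case URLs, and it extracts a structured record from every case page before writing the result to CaseRecord.json.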


import requests  # Python HTTP client library, used for writing crawlers and checking server responses
import re
import json
from bs4 import BeautifulSoup
import copy
import urllib.request
import urllib.parse

def downloadPostPage(url, dictdata, headers, charset='utf-8', reqnum=5):
    # URL-encode the form fields and POST them, retrying up to reqnum times
    data = bytes(urllib.parse.urlencode(dictdata), encoding=charset)
    req = urllib.request.Request(url, data, headers=headers, method='POST')
    info = None
    for _ in range(reqnum):
        try:
            response = urllib.request.urlopen(req)
            info = response.read().decode(charset)
            break
        except Exception as e:
            # Server or network error; try the request again
            print(e)
    return info
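# downloadPostPage returns the decoded response body, or None if every attempt failed;
# callers should check for None before passing the result to json.loads.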

if __name__ == '__main__':
    dic = {
        'title': '标题',
        'abstract': '摘要',
        'studentInfo': {
            'study_exp': '最高教育经历',
            'school_type': '院校类型',
            # Score types: TOEFL, TOEFL Junior, SSAT, SLEP
            'grade': []
        },
        'offerInfo': {
            'school': 'value1',
            'degree': 'value2',
            'date': 'value2'
        },
        'paragraphs': [
            {'title': '标题1', 'content': 'content1'},
            {'title': '标题2', 'content': 'content2'},
            {'title': '标题3', 'content': 'content3'},
            {'title': '标题4', 'content': 'content4'},
            {'title': '标题5', 'content': 'content5'}
        ]
    }
    dicList = []
    urlList = []
    url = 'http://case.bailitop.com/cases/yuanxiaoajax.shtml'
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    }
    for jzgd in range(1):
        dictdata = {
            'jzgd': 4, 'type': 2, 'mbcountry': '美国', 'mbdegree': '高中', 'univ': '', 'major': '', 'gpa': '',
            'toefl': '',
            'ielts': ''
        }
        # Request URL, POST fields dictdata, headers, and number of attempts reqnum
        info = downloadPostPage(url, dictdata, headers=headers, reqnum=1)
        jsonLoads = json.loads(info)[0]
        reg = re.compile(r'http://case\.bailitop\.com/yuanxiao/\d*\.shtml')
        urlList = urlList + re.findall(reg, jsonLoads)
    print(urlList)
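    # Note: only one request is issued above, with 'jzgd' hardcoded to 4 and the loop
    # variable never used. Assuming 'jzgd' is the "load more" counter the endpoint pages
    # on (an assumption, not confirmed by the post), a paging sketch could look like:
    #
    #     for jzgd in range(1, 5):
    #         dictdata['jzgd'] = jzgd
    #         info = downloadPostPage(url, dictdata, headers=headers, reqnum=3)
    #         if info:
    #             urlList += re.findall(r'http://case\.bailitop\.com/yuanxiao/\d*\.shtml',
    #                                   json.loads(info)[0])
    #     urlList = list(dict.fromkeys(urlList))  # drop duplicates, keep order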
    for webSite in urlList:
        print('\n')
        html = requests.get(webSite)
        html.raise_for_status()
        html.encoding = 'utf-8'
        try:
            data1 = {'type': '', 'value': ''}
            soup = BeautifulSoup(html.text, 'html.parser')

            # Article body
            soupContent = soup.find("div", class_="anli_wenzhang")

            title1 = str(soupContent.p)
            reg = re.compile('【(.*)】', re.S)
            title = re.findall(reg, title1)
            # print(title)
            flag = 1
            content1 = ''
            content2 = ''
            content3 = ''
            content4 = ''
            content5 = ''
            title2 = ''
            title3 = ''
            title4 = ''
            title5 = ''
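            # The article body alternates 【...】 heading paragraphs with plain content
            # paragraphs; 'flag' tracks which of the five sections the following siblings
            # belong to, so each section's text ends up in the matching contentN variable.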
            for sibling in soupContent.p.next_siblings:
                sibling = str(sibling)
                sibling = sibling.replace('\r', '')
                sibling = sibling.replace('\n', '')
                if sibling.count('【'):
                    flag = flag + 1
                    reg = re.compile('【(.*)】', re.S)
                    if flag == 2:
                        title2 = re.findall(reg, sibling)
                    elif flag == 3:
                        title3 = re.findall(reg, sibling)
                    elif flag == 4:
                        title4 = re.findall(reg, sibling)
                    elif flag == 5:
                        title5 = re.findall(reg, sibling)
                else:
                    if flag == 1:
                        if content1 == '':
                            content1 = content1 + sibling
                        else:
                            content1 = content1 + '\n' + sibling
                    elif flag == 2:
                        if content2 == '':
                            content2 = content2 + sibling
                        else:
                            content2 = content2 + '\n' + sibling
                    elif flag == 3:
                        if content3 == '':
                            content3 = content3 + sibling
                        else:
                            content3 = content3 + '\n' + sibling
                    elif flag == 4:
                        if content4 == '':
                            content4 = content4 + sibling
                        else:
                            content4 = content4 + '\n' + sibling
                    elif flag == 5:
                        if content5 == '':
                            content5 = content5 + sibling
                        else:
                            content5 = content5 + '\n' + sibling
            # Strip the tags that str() left around each sibling. The exact literals were
            # lost when the post was rendered ('<…>' below); for the content variables they
            # were most likely the '<p>' and '</p>' wrappers.
            content1 = content1.replace('<…>', '')
            content1 = content1.replace('<…>', '')
            content2 = content2.replace('<…>', '')
            content2 = content2.replace('<…>', '')
            content3 = content3.replace('<…>', '')
            content3 = content3.replace('<…>', '')
            content4 = content4.replace('<…>', '')
            content4 = content4.replace('<…>', '')
            content5 = content5.replace('<…>', '')
            content5 = content5.replace('<…>', '')
            content3 = content3.replace('百利天下', '智课')
            content4 = content4.replace('百利天下', '智课')
            content5 = content5.replace('百利天下', '智课')
            content1 = content1.replace('\u3000', '')
            content2 = content2.replace('\u3000', '')
            content3 = content3.replace('\u3000', '')
            content4 = content4.replace('\u3000', '')
            content5 = content5.replace('\u3000', '')
            content5 = content5.replace('\n', '')
            content5 = content5.replace('<…>', '')
            # (A few lines were lost here in the original post: a check on content5 and,
            # presumably, the code that copies title1..5 / content1..5 into dic['paragraphs'].)

            soup = str(soup)  # the regex extraction below works on the raw HTML string

            # Abstract
            reg = re.compile('摘要:(.*)<…>.*', re.S)
            abstract = re.findall(reg, soup)[0]
            abstract = abstract.replace('百利天下', '智课')
            dic['abstract'] = abstract
            print(abstract)

            # Title (a few pages are not matched; remove those records by hand)
            reg = re.compile('<…>(.*)<…>', re.S)
            title = re.findall(reg, soup)[0]
            if title.count('<…>'):
                reg = re.compile('<…>(.*)', re.S)
                title = re.findall(reg, title)[0]
            title = title.replace('<…>', '')
            dic['title'] = title
            print(title)

            # Offer details
            reg = re.compile(
                '<…>录取院校:(.*)<…>\n<…>\n<…>授予学位:(.*)入学时间:(.*?)<…>\n<…>\n', re.S)
            offerInfo = re.findall(reg, soup)[0]
            dic['offerInfo']['school'] = offerInfo[0]
            dic['offerInfo']['degree'] = offerInfo[1]
            dic['offerInfo']['date'] = offerInfo[2]
            print(offerInfo)

            # Student profile
            reg = re.compile(
                '<…>最高教育经历:(.*)<…>\n<…>院校类型:(.*)<…>\n<…>\n<…>语言成绩:(.*?)<…>', re.S)
            studentInfo = re.findall(reg, soup)
            if len(studentInfo) == 0:
                # Only 最高教育经历 and 院校类型 are present
                reg = re.compile(
                    '<…>最高教育经历:(.*)<…>\n<…>院校类型:(.*?)<…>\n<…>', re.S)
                studentInfo = re.findall(reg, soup)
                if len(studentInfo) == 0:
                    # Only 院校类型 and 语言成绩 are present
                    reg = re.compile(
                        '<…>院校类型:(.*?)<…>\n<…>\n<…>语言成绩:(.*?)<…>', re.S)
                    studentInfo = re.findall(reg, soup)
                    studentInfo = studentInfo[0]
                    grade = studentInfo[1]
                    grade = grade.replace('\xa0', ' ')  # the lost literal was probably a non-breaking space
                    grade = grade.replace(';', '')
                    grade = grade.replace('  ', ' ')
                    dic['studentInfo']['study_exp'] = ''
                    dic['studentInfo']['school_type'] = studentInfo[0]
                    # dic['studentInfo']['grade'] = grade
                    print('院校类型:', studentInfo[0], '||语言成绩:', grade)
                    reg = re.compile(r' ')
                    gradeList = re.split(reg, grade)
                    for n in range(int((len(gradeList) - 1) / 2)):
                        data1['type'] = gradeList[n * 2]
                        data1['value'] = gradeList[n * 2 + 1]
                        dic['studentInfo']['grade'].append(data1)
                        data1 = copy.deepcopy(data1)
                else:
                    studentInfo = studentInfo[0]
                    dic['studentInfo']['study_exp'] = studentInfo[0]
                    dic['studentInfo']['school_type'] = studentInfo[1]
                    # dic['studentInfo']['grade'] = ''
                    print('最高教育经历:', studentInfo[0], '||院校类型:', studentInfo[1])
            else:
                studentInfo = studentInfo[0]
                grade = studentInfo[2]
                grade = grade.replace('\xa0', ' ')  # the lost literal was probably a non-breaking space
                grade = grade.replace(';', '')
                grade = grade.replace('  ', ' ')
                dic['studentInfo']['study_exp'] = studentInfo[0]
                dic['studentInfo']['school_type'] = studentInfo[1]
                # dic['studentInfo']['grade'] = grade
                print('最高教育经历:', studentInfo[0], '||院校类型:', studentInfo[1], '||语言成绩:', grade)
                reg = re.compile(r' ')
                gradeList = re.split(reg, grade)
                for n in range(int((len(gradeList) - 1) / 2)):
                    data1['type'] = gradeList[n * 2]
                    data1['value'] = gradeList[n * 2 + 1]
                    dic['studentInfo']['grade'].append(data1)
                    data1 = copy.copy(data1)
            dicList.append(dic)
            dic = copy.deepcopy(dic)
            dic['studentInfo']['grade'].clear()
        except Exception as e:
            print(e)

    jsonList = json.dumps(dicList, ensure_ascii=False)
    print(jsonList)
    # Write the collected cases to a file
    with open("CaseRecord.json", "w", encoding='utf-8') as f:
        f.write(jsonList)
    print("Finished writing CaseRecord.json...")
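To sanity-check the result, CaseRecord.json can be loaded back with the standard json module; a minimal sketch:

import json

with open("CaseRecord.json", encoding='utf-8') as f:
    records = json.load(f)
print(len(records), 'cases scraped')
print(records[0]['offerInfo'])  # school / degree / date of the first case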
