Collecting web data with a Python crawler

from bs4 import BeautifulSoup
import re
import urllib.parse
import urllib.request
import os
import datetime
import json

# params  CategoryId=808 CategoryType=SiteHome ItemListActionName=PostList PageIndex=3 ParentCategoryId=0 TotalPostCount=4000
def getHtml(url, values):
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values)
    # Attach the headers via a Request object; otherwise they are never actually sent
    request = urllib.request.Request(url + '?' + data, headers=headers)
    response_result = urllib.request.urlopen(request).read()
    html = response_result.decode('utf-8')
    return html
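getHtml raises on any network failure. If you want the crawler to skip bad pages instead of crashing, a minimal sketch of a tolerant wrapper looks like this (getHtmlSafe is an illustrative name, not part of the original script):

import urllib.error

def getHtmlSafe(url, values):
    # Hypothetical wrapper: returns None instead of raising when the request fails
    # (urllib.error.HTTPError is a subclass of URLError, so both are caught here)
    try:
        return getHtml(url, values)
    except urllib.error.URLError as e:
        print('request failed:', e)
        return None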

# Fetch one page of data
def requestCnblogs(index):
    print('requesting data')
    # Note: the URL points at csdn.net, while the query parameters below follow cnblogs' PostList endpoint
    url = 'https://www.csdn.net/'
    value= {
         'CategoryId':808,
         'CategoryType' : 'SiteHome',
         'ItemListActionName' :'PostList',
         'PageIndex' : index,
         'ParentCategoryId' : 0,
        'TotalPostCount' : 4000
    }
    result = getHtml(url,value)
    return result

# Parse the outer list of posts
def blogParser(index):
    cnblogs = requestCnblogs(index)
    soup = BeautifulSoup(cnblogs, 'html.parser')
    all_div = soup.find_all('div', attrs={'class': 'list_con'}, limit=20)
    blogs = []
    # Loop over the divs and extract the details of each post
    for item in all_div:
        blog = analyzeBlog(item)
        blogs.append(blog)
    return blogs

# Parse a single post entry
def analyzeBlog(item):
    result = {}
    a_title = find_all(item, 'div', 'title')[0].find_all('a')[0]
    if a_title is not None:
        # Post title
        result["title"] = a_title.string.replace("\n", "").strip()
        # Post link
        result["href"] = a_title['href']
    p_summary = find_all(item, 'div', 'summary oneline')
    if p_summary is not None:
        # Summary
        result["summary"] = p_summary[0].text.replace("\n", "").strip()

    footers = find_all(item, 'dl', 'list_userbar')[0]
    author = find_all(footers, 'dd', 'name')[0]
    # Author
    result["author"] = author.find_all('a')[0].string.replace("\n", "").strip()
    # Author URL
    result["author_url"] = author.find_all('a')[0]['href']
    time = find_all(footers, 'dd', 'time')[0].text
    result["create_time"] = time.replace("\n", "").strip()

    # Comment count
    comment_str = find_all(footers, 'dd', 'read_num')[0].find_all('span')[0].text
    result["comment_num"] = comment_str

    # View count
    view_str = find_all(footers, 'dd', 'common_num')[0].find_all('span')[0].string
    result["view_num"] = view_str

    return result

def find_all(item, attr, c):
    # Thin wrapper around BeautifulSoup's find_all: returns at most one tag of
    # type `attr` whose class attribute is `c`
    return item.find_all(attr, attrs={'class': c}, limit=1)

def writeToTxt(list_name, file_path):
    try:
        # Write each item directly; don't pre-serialize it yourself, or the
        # output framing will be malformed
        with open(file_path, "w+", encoding='utf-8') as fp:
            print(file_path)
            l = len(list_name)
            i = 0
            fp.write('[')
            for item in list_name:
                fp.write(str(item))
                # Add a comma after every element except the last
                if i < l - 1:
                    fp.write(',\n')
                i += 1
            fp.write(']')
    except IOError:
        print("failed to open file")


def saveBlogs():
    for i in range(1, 2):
        print('request for ' + str(i) + '...')
        blogs = blogParser(i)
        # Save this page to a file
        path = createFile()
        writeToTxt(blogs, path + '/blog_' + str(i) + '.json')
        print('page ' + str(i) + ' done')
    return 'success'

def createFile():
    date = datetime.datetime.now().strftime('%Y-%m-%d')
    path = 'F:/Blog/' + date
    # makedirs also creates missing parents and is a no-op if the directory
    # already exists; plain mkdir would fail if F:/Blog did not exist
    os.makedirs(path, exist_ok=True)
    return path
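An equivalent sketch with pathlib, keeping the original hard-coded F:/Blog base directory (createFileDir is an illustrative name; adjust the base path for your platform):

from pathlib import Path

def createFileDir(base='F:/Blog'):
    # Hypothetical variant of createFile: builds <base>/<YYYY-MM-DD> and
    # creates it (including parents) if missing
    path = Path(base) / datetime.datetime.now().strftime('%Y-%m-%d')
    path.mkdir(parents=True, exist_ok=True)
    return str(path)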

if __name__ == '__main__':
    result = blogParser(1)
    print(result)
    # print("python爬取CSDN:")
    # info = saveBlogs()
    # print(info)
