python 简单爬取segmentFault 前100页数据进行分析关于哪个技术栈提问最多

使用 urllib、BeautifulSoup 和 pymysql

抓取页代码

   from urllib.request import Request, urlopen
   from bs4 import BeautifulSoup
   import re
   import pymysql
   datas = [] 
     def getHtml(url):
    page = urlopen(url)
    if page.getcode() != 200:
        return None;
    html = page.read().decode(encoding='utf-8')
    return html
def parser(html_doc):
    """Parse one listing page and persist every question summary.

    For each question on the page, extracts the title and the set of tags,
    appends the record to the module-level ``datas`` list, and inserts it
    into the ``segment`` MySQL table.
    """
    soup = BeautifulSoup(html_doc, 'html.parser')
    summs = soup.findAll('div', class_="summary")
    config = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'root',
        'password': 'root',
        'db': 'python_test',
        'charset': 'UTF8',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    # Open ONE connection for the whole page; the original reconnected to
    # MySQL once per row inside the loop, which is needlessly expensive.
    # config is a dict, so it is expanded into keyword arguments with **.
    const = pymysql.Connect(**config)
    try:
        cursor = const.cursor()
        sql = 'insert into segment(title,tags) values(%s,%s)'
        for summ in summs:
            # res_data must be created INSIDE the loop: a dict declared once
            # outside would keep the same memory address, so every entry
            # appended to `datas` would alias the same (last) record.
            res_data = {}
            res_data['title'] = summ.find('h2', class_="title").find('a').get_text()
            tags_tag = set()
            for tag in summ.findAll('li', class_="tagPopup"):
                tags_tag.add(tag.find('a', class_='tag').get_text())
            res_data['tags'] = tags_tag
            datas.append(res_data)
            try:
                cursor.execute(sql, (res_data['title'], str(res_data['tags'])))
                # Autocommit is off by default; commit explicitly so the
                # executed INSERT is actually saved.
                const.commit()
            except Exception as e:
                # Best-effort per row: log and keep going with the next one.
                print(e)
        cursor.close()
    finally:
        const.close()

if __name__ == '__main__':
    url = 'https://segmentfault.com/t/javascript?type=newest&page='
    # range(1, 101) really covers 100 pages; the original
    # `while count < 100` stopped at page 99.
    for count in range(1, 101):
        new_url = url + str(count)
        html_dom = getHtml(new_url)
        print('正在执行第' + str(count) + '页的内容抓取')
        if html_dom is None:
            # Non-200 response: skip this page instead of handing None
            # to parser(), which would crash BeautifulSoup.
            continue
        parser(html_dom)
    print('程序执行完毕')

统计页代码

import ast

import pymysql
# MySQL connection settings for the stats script; DictCursor makes each
# fetched row a dict keyed by column name (row['tags'] below).
config = {
        'host':'127.0.0.1',
        'port':3306,
        'user':'root',
        'password':'root',
        'db':'python_test',
        'charset':'UTF8',
        'cursorclass':pymysql.cursors.DictCursor,
}

def get_tags():
    """Read every stored tag set from MySQL and write a frequency table.

    Produces ``output1.html`` containing one table row per tag, sorted by
    how often the tag appeared, most frequent first.
    """
    connect = pymysql.connect(**config)
    try:
        cursor = connect.cursor()
        cursor.execute("select tags from segment")
        result = cursor.fetchall()

        # Count occurrences of each tag across all rows.  Each row stores
        # str(set(...)).  ast.literal_eval parses that back safely — unlike
        # the original eval(), it cannot execute arbitrary expressions.
        # literal_eval cannot parse the repr of an EMPTY set ("set()"),
        # so that case is handled explicitly.
        tags_box = {}
        for row in result:
            raw = row['tags']
            items = set() if raw == 'set()' else ast.literal_eval(raw)
            for item in items:
                tags_box[item] = tags_box.get(item, 0) + 1

        # Most frequent tag first.
        new_tag = sorted(tags_box.items(), key=lambda x: x[1], reverse=True)

        fout = open('output1.html', 'w')
        try:
            # NOTE(review): the HTML tag strings were lost in this copy of
            # the article (the write('') calls were emptied); the markup
            # below reconstructs the obvious intent — a simple table of
            # tag -> count — and should be confirmed against the original.
            fout.write('<html>')
            fout.write('<head>')
            fout.write('<meta charset="utf-8">')
            fout.write('<title>tag统计</title>')
            fout.write('</head>')
            fout.write('<body>')
            fout.write('<table>')
            for item in new_tag:
                fout.write('<tr>')
                for i in item:
                    fout.write('<td>%s</td>' % i)
                fout.write('</tr>')
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')
        finally:
            fout.close()
    except Exception as e:
        print(e)
    finally:
        connect.close()


if __name__ == '__main__':
    get_tags()

你可能感兴趣的:(python 简单爬取segmentFault 前100页数据进行分析关于哪个技术栈提问最多)