# Python前几篇代码汇总 (Summary of the code from the previous Python posts)

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import json
# Fetch the total comment count for a news article.
def getComment(newsurl, timeout=10):
    """Return the total comment count for the Sina news article at *newsurl*.

    The article id is taken from the ``doc-i<id>.shtml`` part of the URL and
    substituted into the comment-API URL. The response is expected to be a
    JavaScript assignment (``var <name>={...}``) wrapping a JSON object.

    Args:
        newsurl: Article URL containing ``doc-i<id>.shtml``.
        timeout: Seconds to wait for the comment API before giving up.

    Returns:
        The value stored under ``result -> count -> total`` in the response.

    Raises:
        ValueError: If *newsurl* contains no ``doc-i<id>.shtml`` part, or the
            response body holds no valid JSON object
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    if m is None:
        # Fail with a clear message instead of AttributeError on None.group.
        raise ValueError('cannot extract news id from URL: {!r}'.format(newsurl))
    newsid = m.group(1)
    commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20&jsvar=loader_1500520949298_3273313'
    comments = requests.get(commentURL.format(newsid), timeout=timeout)
    # str.strip() removes a *set of characters*, not a prefix, so stripping a
    # literal "var loader_...=" only works by coincidence. Slice out the JSON
    # object by its outermost braces instead; this does not depend on the
    # name of the JavaScript variable.
    text = comments.text
    jd = json.loads(text[text.find('{'):text.rfind('}') + 1])
    return jd['result']['count']['total']

# Fetch and parse the details of a single news article.
def getNewsDetail(newsurl, timeout=10):
    """Download the article page at *newsurl* and extract its fields.

    Args:
        newsurl: URL of the article page.
        timeout: Seconds to wait for the article page before giving up.

    Returns:
        A dict with the keys, in this order:
        ``title`` (text of ``#artibodyTitle``),
        ``newssource`` (text of the first ``.time-source span a``),
        ``dt`` (``datetime`` parsed from the first child of ``.time-source``),
        ``article`` (text of every ``#artibody p`` except the last, joined
        with spaces),
        ``editor`` (text of ``.article-editor`` without the "责任编辑" label),
        ``conments`` (result of ``getComment(newsurl)``).

    Raises:
        IndexError: If one of the expected page elements is missing.
        ValueError: If the timestamp does not match ``%Y年%m月%d日%H:%M``.
    """
    res = requests.get(newsurl, timeout=timeout)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    timesource = soup.select('.time-source')[0].contents[0].strip()
    editor_text = soup.select('.article-editor')[0].text
    # str.strip('责任编辑 :') removes any of those characters from BOTH ends,
    # which truncates editor names that begin or end with 责/任/编/辑 and
    # leaves a full-width colon in place. Remove the label as a prefix.
    editor = re.sub(r'^\s*责任编辑\s*[:：]\s*', '', editor_text).strip()

    return {
        'title': soup.select('#artibodyTitle')[0].text,
        'newssource': soup.select('.time-source span a')[0].text,
        'dt': datetime.strptime(timesource, '%Y年%m月%d日%H:%M'),
        # The last <p> is dropped; presumably it holds the editor line.
        'article': ' '.join(
            p.text.strip() for p in soup.select('#artibody p')[:-1]
        ),
        'editor': editor,
        # NOTE: the key is misspelled ("conments") but is kept as-is because
        # it becomes a column name in the exported data.
        'conments': getComment(newsurl),
    }
# Parse one page of the news-list API into article details.
def parseListLinks(url, timeout=10):
    """Return the details of every article listed by the list-API page *url*.

    The response is expected to be JSON wrapped in a JSONP call
    (``newsloadercallback({...});``). Each entry under ``result -> data``
    has its ``url`` passed to ``getNewsDetail``.

    Args:
        url: Fully formatted list-API URL for one page.
        timeout: Seconds to wait for the list API before giving up.

    Returns:
        A list with one ``getNewsDetail`` result per listed article.

    Raises:
        ValueError: If the response body holds no valid JSON object
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    res = requests.get(url, timeout=timeout)
    # lstrip()/rstrip() strip *character sets*, not a prefix/suffix, so using
    # them to peel off the JSONP wrapper is fragile. Slice out the JSON
    # object by its outermost braces instead.
    text = res.text
    jd = json.loads(text[text.find('{'):text.rfind('}') + 1])
    return [getNewsDetail(ent['url']) for ent in jd['result']['data']]

# Crawl the paginated list API and gather every article's details in one list.
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1500521276419'
news_total = []
for page in range(1, 3):  # list pages 1 and 2
    # The page number fills the "page={}" placeholder in the URL template.
    news_total += parseListLinks(url.format(page))

# Turn the collected article records into a DataFrame.
import pandas
df = pandas.DataFrame(data=news_total)
# Preview of the first three rows; the result is neither assigned nor printed.
df.head(n=3)
# Export the DataFrame to an Excel workbook.
df.to_excel(excel_writer='news.xlsx')

# Write the DataFrame to a SQLite database.
import sqlite3
from contextlib import closing

# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does NOT close the connection. Wrapping it in closing()
# releases the connection (and the file handle) when the block exits, while
# the inner "db" context keeps the commit/rollback behaviour.
# NOTE: to_sql() uses its default if_exists setting here, so re-running the
# script against an existing "news" table is expected to raise — confirm
# whether that is intended.
with closing(sqlite3.connect('news.qlite')) as db, db:
    df.to_sql("news", con=db)

# Read the data back from the database.
with closing(sqlite3.connect('news.qlite')) as db, db:
    df2 = pandas.read_sql("SELECT * FROM news", con=db)
# Bare expression: the result is neither assigned nor printed.
df2

# 你可能感兴趣的:(Python前几篇代码汇总) (You may also be interested in: summary of the code from the previous Python posts)