Help-me code: Python crawlers (Tieba "police吧", Douban) and MySQL queries

Tieba "police吧" crawler

# -*- coding: utf-8 -*-
import requests
import re

def get_page(url):
    # browser-like request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36'}
    # proxies = {
    #     'http': 'http://59.172.27.6:38380',
    #     'https': 'https://59.172.27.6:38380'
    # }
    try:
        response = requests.get(url, headers=headers)  # send the request with the headers defined above
        if response.status_code == 200:  # request succeeded
            return response.text  # return the whole HTML page as text
    except requests.RequestException:
        print('Failed to request the page!')

# Parser for the Tieba list page: extracts the title, author, link and creation time of each "police吧" post

def get_posts_info(html):
    posts_title = re.findall(r'href="/p/\d+" title="(.+?)"', html)   # post titles
    posts_author = re.findall(r'title="主题作者:(.+?)"', html)        # post authors
    posts_href = re.findall(r'href="(/p/\d+)"', html)                # post links (relative, e.g. /p/123456)
    post_createtime = re.findall(r'title="创建时间">(.+?)<', html)    # post creation times
    print('Post titles:', posts_title)
    print('Post authors:', posts_author)
    print('Post links:', posts_href)
    print('Post creation times:', post_createtime)
    # prepend the site root to the relative links
    posts_href_complete = ['https://tieba.baidu.com' + i for i in posts_href]
    zipped = zip(posts_title, posts_author, posts_href_complete, post_createtime)
    return zipped

def save_as_txt(zipped):
    # append each post as one tab-separated line
    with open('posts.txt', 'a+', encoding='utf-8') as f:
        for i in zipped:
            f.write('\t'.join(i))
            f.write('\n')

# program entry point
if __name__ == '__main__':
    base_url = 'https://tieba.baidu.com/f?kw=police&ie=utf-8&pn={}'  # base URL of the "police吧" list pages
    for i in range(0, 250, 50):  # pn advances by 50 per page, i.e. the first 5 pages
        page_url = base_url.format(i)  # fill in the page offset
        html = get_page(page_url)  # fetch the whole HTML page
        if html:  # skip the page if the request failed
            posts_data = get_posts_info(html)  # extract titles, authors, links and creation times
            save_as_txt(posts_data)
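save_as_txt joins the four fields with tab characters, which breaks as soon as a title itself contains a tab or a newline. Below is a minimal alternative sketch using the standard csv module; the save_posts_csv name and the posts.csv path are illustrative, not part of the original script.

import csv

def save_posts_csv(zipped, path='posts.csv'):
    # append one (title, author, url, create_time) row per post;
    # csv.writer quotes fields that contain commas, tabs or newlines
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for row in zipped:
            writer.writerow(row)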

Douban

import requests
from urllib.parse import urlencode
import pandas as pd

def getHTMLText(url):
    # fetch one page of the hot-items JSON API with browser-like headers and a logged-in cookie
    try:
        headers = {"Referer": "https://www.douban.com/gallery/",
                   "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
        cookies = {'cookie': 'll="118172"; bid=Vy6Pz32LzDw; douban-fav-remind=1; __gads=ID=d17a12675dd38b6a-2238d4679fd800f7:T=1669073554:RT=1669073554:S=ALNI_MYyGsJx5Y8_xL0yEAIBh0sK2PhdcQ; viewed="3260214"; gr_user_id=ff962b83-284a-4b2c-8e20-d9f161fe0963; __utmz=30149280.1678271238.6.5.utmcsr=so.com|utmccn=(referral)|utmcmd=referral|utmcct=/link; __gpi=UID=00000b80e0dabc90:T=1669073554:RT=1678271453:S=ALNI_MYB5epDed1oBb_UiWO2fP8tMCs6aA; _pk_ref.100001.8cb4=["","",1680604681,"https://movie.douban.com/subject/1307914/comments?status=P"]; _pk_id.100001.8cb4=6365e7660926ca38.1669073494.; _pk_ses.100001.8cb4=1; ap_v=0,6.0; __utma=30149280.898744624.1652660662.1678271238.1680604682.7; __utmc=30149280; __utmt=1; __utmb=30149280.1.10.1680604682'}
        r = requests.get(url, headers=headers, cookies=cookies, timeout=10)
        r.raise_for_status()  # treat non-2xx responses as failures
        return r.json()
    except requests.RequestException:
        print("fail")

def get_topics(html):
    # parse one page of the hot-items JSON into a list of dicts, stored in the global data_list
    global data_list
    data_list = []
    if html and html.get('items'):
        diary_title = []
        diary_abstract = []
        diary_url = []
        diary_author = []
        topic_sharing_url = []
        topic_name = []
        diary_author_url = []
        for item in html.get('items'):
            diary = item.get('target')
            diary_title.append(diary.get('title'))
            diary_abstract.append(diary.get('abstract'))
            diary_url.append(diary.get('url'))
            diary_author.append(diary.get('author').get('name'))
            topic_sharing_url.append(item.get('topic').get('sharing_url'))
            topic_name.append(item.get('topic').get('name'))
            diary_author_url.append(diary.get('author').get('url'))
        for a, b, c, d, e, f, g in zip(diary_title, diary_abstract, diary_url, diary_author,
                                       topic_sharing_url, topic_name, diary_author_url):
            x = {}
            x['diary_title'] = a
            x['diary_abstract'] = b
            x['diary_url'] = c
            x['diary_author'] = d
            x['topic_sharing_url'] = e
            x['topic_name'] = f
            x['diary_author_url'] = g
            data_list.append(x)

base_url = 'https://m.douban.com/rexxar/api/v2/gallery/hot_items?'  # JSON API behind the gallery page

def save_as_csv(data):
    # append one page of records to the CSV file
    df = pd.DataFrame(data)
    df.to_csv(r'/home/qingjiao/topics.csv', header=False, mode='a')

for start in range(20, 120, 20):  # offsets 20, 40, ..., 100: five pages of 20 items each
    params = {
        "ck": "null",
        "start": start,
        "count": 20
    }
    url = base_url + urlencode(params)
    html = getHTMLText(url)
    # print(html)
    if html:  # only parse and save pages that were fetched successfully
        get_topics(html)
        save_as_csv(data_list)
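Because save_as_csv is called once per page with header=False and mode='a', topics.csv ends up with no column names, and every run keeps appending to the same file. A small sketch of a variant that writes the header row only when the file does not exist yet; the write_topics name and the relative topics.csv path are illustrative.

import os
import pandas as pd

def write_topics(data, path='topics.csv'):
    # append this page's records; emit the header row only on the first write
    df = pd.DataFrame(data)
    df.to_csv(path, mode='a', index=False, header=not os.path.exists(path))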

MySQL code

-- Deduplicate the raw transaction-behaviour table into 行为数据
CREATE TABLE 行为数据 AS
SELECT DISTINCT * FROM `用户交易行为数据`;

-- Action counts per behaviour type and day of week (WEEKDAY()+1: Monday=1 ... Sunday=7)
SELECT 行为类型, COUNT(交易单号) 行为次数, WEEKDAY(行为发生时间)+1 星期
FROM 行为数据
GROUP BY 行为类型, 星期;

-- Action counts per behaviour type
SELECT 行为类型, COUNT(交易单号) 行为的次数
FROM 行为数据
GROUP BY 行为类型;

-- Action counts per behaviour type and hour of day, busiest combinations first
SELECT 行为类型, COUNT(交易单号) 行为次数, HOUR(行为发生时间) 时段
FROM 行为数据
GROUP BY 行为类型, 时段
ORDER BY COUNT(交易单号) DESC;

-- Action counts per 时段 (行为数据2 is assumed to be a derived table that already has a 时段 column; its creation is not shown here)
SELECT 时段, COUNT(*)
FROM 行为数据2
GROUP BY 时段
ORDER BY COUNT(*) DESC;

-- People who used the same 上网账号 on the same date as 王五五 at least 5 times (self-join on 上网数据)
SELECT b.证件号, b.姓名, COUNT(b.证件号)
FROM 上网数据 a, 上网数据 b
WHERE a.姓名='王五五' AND a.上网账号=b.上网账号 AND a.上网日期=b.上网日期 AND a.证件号!=b.证件号
GROUP BY b.证件号, b.姓名
HAVING COUNT(b.证件号)>=5;
