Police Tieba ("police吧") scraper — collects post titles, authors, links, and creation times from the Baidu Tieba listing pages.
# -*- coding: utf-8 -*-
import requests
import re


def get_page(url):
    # Request headers for the Tieba listing pages
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36'}
    # proxies = {
    #     'http': 'http://59.172.27.6:38380',
    #     'https': 'https://59.172.27.6:38380'
    # }
    try:
        response = requests.get(url, headers=headers)  # send the request using the headers defined above
        if response.status_code == 200:  # request succeeded
            return response.text  # return the whole HTML page as text
    except requests.RequestException:
        print('Failed to request the page!!!')

# Parser for the listing page: extracts post titles, authors, links, and creation times from the "police" bar
def get_posts_info(html):
    posts_title = re.findall(r'href="/p/\d+" title="(.+?)"', html)   # post titles
    posts_author = re.findall(r'title="主题作者:(.+?)"', html)        # post authors
    posts_href = re.findall(r'href="(/p/\d+)"', html)                 # post links (relative paths such as /p/123456)
    post_createtime = re.findall(r'title="创建时间">(.+?)<', html)     # post creation times
    print('Post titles:', posts_title)
    print('Post authors:', posts_author)
    print('Post links:', posts_href)
    print('Post creation times:', post_createtime)
    # The captured hrefs are relative, so prepend the site root (not '/p' again) to build full URLs
    posts_href_complete = ['https://tieba.baidu.com' + i for i in posts_href]
    zipped = zip(posts_title, posts_author, posts_href_complete, post_createtime)
    return zipped

def save_as_txt(zipped):
    # Append one tab-separated line per post; the with-block closes the file automatically
    with open('posts.txt', 'a+', encoding='utf-8') as f:
        for i in zipped:
            f.write('\t'.join(i))
            f.write('\n')

# Program entry point
if __name__ == '__main__':
    base_url = 'https://tieba.baidu.com/f?kw=police&ie=utf-8&pn={}'  # base URL of the "police" bar listing
    for i in range(0, 250, 50):  # pn advances by 50 per page, 5 pages in total
        page_url = base_url.format(i)      # fill the page offset into the URL
        html = get_page(page_url)          # fetch the raw HTML of the listing page
        if html:                           # skip pages whose request failed
            posts_data = get_posts_info(html)  # parse titles, authors, links, and creation times
            save_as_txt(posts_data)            # append the results to posts.txt
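
The script above appends one tab-separated record per post to posts.txt. As a quick sanity check, the file can be loaded back for inspection; the following is a minimal sketch, assuming the file was produced by the script above (the column names are illustrative, the script itself writes no header row).

import pandas as pd

# Minimal sketch: load the tab-separated output back for inspection.
posts = pd.read_csv('posts.txt', sep='\t', header=None,
                    names=['title', 'author', 'url', 'created_at'])
print(posts.head())
print(len(posts), 'posts collected')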
Douban gallery scraper — collects hot-topic diary entries (title, abstract, URLs, author, topic) from the Douban gallery API and appends them to a CSV file.
import requests
from urllib.parse import urlencode
import pandas as pd

base_url = 'https://www.douban.com/gallery/'  # gallery landing page (also used as the Referer below)

def getHTMLText(url):
    # Fetch one page of the gallery API and return the parsed JSON
    try:
        headers = {"Referer": "https://www.douban.com/gallery/",
                   "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
        cookies = {'cookie': 'll="118172"; bid=Vy6Pz32LzDw; douban-fav-remind=1; __gads=ID=d17a12675dd38b6a-2238d4679fd800f7:T=1669073554:RT=1669073554:S=ALNI_MYyGsJx5Y8_xL0yEAIBh0sK2PhdcQ; viewed="3260214"; gr_user_id=ff962b83-284a-4b2c-8e20-d9f161fe0963; __utmz=30149280.1678271238.6.5.utmcsr=so.com|utmccn=(referral)|utmcmd=referral|utmcct=/link; __gpi=UID=00000b80e0dabc90:T=1669073554:RT=1678271453:S=ALNI_MYB5epDed1oBb_UiWO2fP8tMCs6aA; _pk_ref.100001.8cb4=["","",1680604681,"https://movie.douban.com/subject/1307914/comments?status=P"]; _pk_id.100001.8cb4=6365e7660926ca38.1669073494.; _pk_ses.100001.8cb4=1; ap_v=0,6.0; __utma=30149280.898744624.1652660662.1678271238.1680604682.7; __utmc=30149280; __utmt=1; __utmb=30149280.1.10.1680604682'}
        r = requests.get(url, headers=headers, cookies=cookies, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.json()
    except Exception:
        print("fail")

def get_topics(html):
    # Parse one page of JSON from the gallery API into a list of dicts (stored in the global data_list)
    global data_list
    data_list = []
    if html and html.get('items'):
        diary_title = []
        diary_abstract = []
        diary_url = []
        diary_author = []
        topic_sharing_url = []
        topic_name = []
        diary_author_url = []
        for item in html.get('items'):
            diary = item.get('target')
            diary_title.append(diary.get('title'))
            diary_abstract.append(diary.get('abstract'))
            diary_url.append(diary.get('url'))
            diary_author.append(diary.get('author').get('name'))
            topic_sharing_url.append(item.get('topic').get('sharing_url'))
            topic_name.append(item.get('topic').get('name'))
            diary_author_url.append(diary.get('author').get('url'))
        for a, b, c, d, e, f, g in zip(diary_title, diary_abstract, diary_url, diary_author,
                                       topic_sharing_url, topic_name, diary_author_url):
            x = {}
            x['diary_title'] = a
            x['diary_abstract'] = b
            x['diary_url'] = c
            x['diary_author'] = d
            x['topic_sharing_url'] = e
            x['topic_name'] = f
            x['diary_author_url'] = g
            data_list.append(x)

# AJAX endpoint that actually serves the gallery items as JSON
base_url = 'https://m.douban.com/rexxar/api/v2/gallery/hot_items?'

def save_as_csv(data):
    # Append one page of records to topics.csv (no header row)
    df = pd.DataFrame(data)
    df.to_csv(r'/home/qingjiao/topics.csv', header=False, mode='a')

for start in range(20, 120, 20):
    params = {
        "ck": "null",
        "start": start,
        "count": 20
    }
    url = base_url + urlencode(params)
    html = getHTMLText(url)
    get_topics(html)
    # print(html)
    save_as_csv(data_list)
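
Because save_as_csv writes with header=False and mode='a', topics.csv ends up with no column names and an extra leading column holding the DataFrame index. A minimal sketch for reading it back, assuming the path used in the script above; the column names simply reuse the dict keys built in get_topics.

import pandas as pd

# Minimal sketch: read the appended CSV back; to_csv also wrote the DataFrame
# index as the first column, so it is mapped to 'row_idx' and used as the index.
cols = ['row_idx', 'diary_title', 'diary_abstract', 'diary_url', 'diary_author',
        'topic_sharing_url', 'topic_name', 'diary_author_url']
topics = pd.read_csv('/home/qingjiao/topics.csv', header=None, names=cols, index_col=0)
print(topics.head())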
MySQL queries
-- Deduplicate the raw transaction-behaviour table into 行为数据
CREATE TABLE 行为数据 AS
SELECT DISTINCT * FROM `用户交易行为数据`;

-- Actions per behaviour type and weekday (WEEKDAY()+1: Monday=1 ... Sunday=7)
SELECT 行为类型, COUNT(交易单号) 行为次数, WEEKDAY(行为发生时间)+1 星期 FROM 行为数据 GROUP BY 行为类型, 星期;

-- Total actions per behaviour type
SELECT 行为类型, COUNT(交易单号) 行为次数 FROM 行为数据 GROUP BY 行为类型;

-- Actions per behaviour type and hour of day, busiest first
SELECT 行为类型, COUNT(交易单号) 行为次数, HOUR(行为发生时间) 时段 FROM 行为数据 GROUP BY 行为类型, 时段
ORDER BY COUNT(交易单号) DESC;

-- Overall activity per time slot in 行为数据2, busiest first
SELECT 时段, COUNT(*) FROM 行为数据2
GROUP BY 时段
ORDER BY COUNT(*) DESC;

-- People who were online on the same account and the same date as 王五五 at least 5 times
SELECT b.证件号, b.姓名, COUNT(b.证件号) FROM 上网数据 a, 上网数据 b
WHERE a.姓名='王五五' AND a.上网账号=b.上网账号 AND a.上网日期=b.上网日期 AND a.证件号!=b.证件号
GROUP BY b.证件号, b.姓名
HAVING COUNT(b.证件号) >= 5;
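
The last query (people online together with 王五五 on the same account and date at least five times) can also be run from Python, in the same spirit as the scrapers above. A minimal sketch, assuming a local MySQL database that already contains the 上网数据 table and that the pymysql driver is installed; the connection string is a placeholder, not part of the original code.

import pandas as pd
from sqlalchemy import create_engine

# Placeholder credentials and database name -- adjust to the actual environment.
engine = create_engine('mysql+pymysql://user:password@localhost:3306/testdb?charset=utf8mb4')

sql = """
SELECT b.证件号, b.姓名, COUNT(b.证件号) AS 同上网次数
FROM 上网数据 a, 上网数据 b
WHERE a.姓名 = '王五五'
  AND a.上网账号 = b.上网账号
  AND a.上网日期 = b.上网日期
  AND a.证件号 != b.证件号
GROUP BY b.证件号, b.姓名
HAVING COUNT(b.证件号) >= 5
"""

df = pd.read_sql(sql, engine)  # run the query and load the result into a DataFrame
print(df)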