爬虫之路(5):爬虫中的常用代码集锦

  • 摘要
  • 请求
  • beautifulsoup系列
  • 正则系列
  • 过滤html标签,保留标签里的内容
  • 过滤script和style标签,标签里的内容也需过滤掉
  • 日期、时间的处理
  • base64编码与解码
  • html转义字符
  • url中的中文解码
  • 数据库操作

摘要

  • 请求url代码
  • beautifulsoup4常用的解析代码
  • 常用的正则解析代码
  • 过滤html标签、style标签和script标签的代码
  • 日期、时间的处理
  • base64编码与解码
  • html的转义字符
  • url中的中文解码
  • 数据库的常用操作

请求

import requests

# Fetch a page and keep its body as text.
# requests needs an absolute URL: without the "http://" scheme it raises
# MissingSchema before any request is sent.
url = "http://www.baidu.com"
# A timeout keeps the crawler from hanging forever on an unresponsive server.
resp = requests.get(url, timeout=10)
htmls = resp.text

beautifulsoup系列

from bs4 import BeautifulSoup
# Parse the downloaded markup with the lxml parser.
soup = BeautifulSoup(htmls, "lxml")

# First <a> tag matching a class, an id and an arbitrary attribute.
# Attribute names that are not valid Python identifiers (such as
# "alog-action") have to be passed through attrs.
soup.find("a", class_="title", id="t1", attrs={"alog-action": "qb-ask-uname"})

# Text content of the first <div>.
# NOTE: find() returns None when nothing matches, so check the result
# before calling get_text() on real pages.
soup.find("div").get_text()

# Same, with surrounding whitespace removed.
soup.find("div").get_text().strip()

# Text of at most the first five <div> tags.
for i in soup.find_all("div", limit=5):
    print(i.get_text())

正则系列

rollback({
    "response": {
        "code": "0",
        "msg": "Success",
        "dext": ""
    },
    "data": {
        "count": 3,
        "page": 1,
        "article_info": [{
            "title": "“小库里”:适应比赛是首要任务 投篮终会找到节奏",
            "url": "http:\/\/sports.qq.com\/a\/20180704\/035378.htm",
            "time": "2018-07-04 16:58:36",
            "column": "NBA",
            "img": "",
            "desc": ""
        }, {
            "title": "首钢体育助力国家冰球集训队 中国冰球联赛年底启动",
            "url": "http:\/\/sports.qq.com\/a\/20180704\/034698.htm",
            "time": "2018-07-04 16:34:44",
            "column": "综合体育",
            "img": "",
            "desc": ""
        }...]
    }
})

import re
# Extract the title and url of every news item from the JSON(P) text above.
# NOTE(review): assumes htmls holds that response text -- confirm.
# Each (.*?) is a capture group; a bare .*? skips whatever lies in between.
# \s* after each colon tolerates the whitespace that follows the key in the
# sample response ('"title": "..."'); without it nothing matches.
reg_str = r'"title":\s*"(.*?)",.*?"url":\s*"(.*?)"'
pattern = re.compile(reg_str, re.DOTALL)
items = pattern.findall(htmls)
for i in items:
    title = i[0]
    # The payload escapes "/" as "\/"; undo that to get a usable URL.
    url = i[1].replace("\\/", "/")

过滤html标签,保留标签里的内容

import re

# Strip every HTML tag but keep the text between the tags.
# NOTE(review): the markup inside the original sample literal was lost;
# this is a reconstructed example that yields the documented output "abc".
htmls = "<div><p>abc</p></div>"
# One tag = "<", one or more characters that are not ">", then ">".
dr = re.compile(r'<[^>]+>', re.S)
htmls2 = dr.sub('', htmls)
print(htmls2)  # abc

过滤script和style标签,标签里的内容也需过滤掉

import requests
from bs4 import BeautifulSoup

# Download an article page and drop every <script> and <style> element,
# including the code/CSS they contain, then print what is left.
url = "http://new.qq.com/omn/20180705/20180705A0920X.html"
r = requests.get(url)
htmls = r.text
soup = BeautifulSoup(htmls, "lxml")
unwanted_tags = soup.find_all(["script", "style"])
for tag in unwanted_tags:
    # extract() detaches the element (and its contents) from the tree.
    tag.extract()
print(soup)

日期、时间的处理

import datetime
import time

# Current calendar date (year-month-day).
today = datetime.date.today()
print(today)     # e.g. 2018-07-05

# Current local time rendered as "YYYY-mm-dd HH:MM:SS".
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print(time_now)     # e.g. 2018-07-05 14:20:55

# Render a given Unix timestamp a in the same format (local time zone).
a = 1502691655
local_a = time.localtime(int(a))
time_a = time.strftime("%Y-%m-%d %H:%M:%S", local_a)
print(time_a)       # e.g. 2017-08-14 14:20:55 when the local zone is UTC+8

# Convert a formatted local-time string back into a Unix timestamp.
time_line = "2018-07-16 10:38:50"
time_tuple = time.strptime(time_line, "%Y-%m-%d %H:%M:%S")
seconds_since_epoch = time.mktime(time_tuple)
time_line2 = int(seconds_since_epoch)

# Parse a formatted string into a datetime object.
# The variable is deliberately not called "str": that would shadow the
# builtin and break any later str(...) call in the same script.
time_str = "2018-07-01 00:00:00"
dt = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")

# Date/time arithmetic

# Tomorrow's date.
today = datetime.date.today()
tomorrow = today + datetime.timedelta(days=1)
print(tomorrow)     # e.g. 2018-07-06

# The moment exactly three days ago.
# Separate names are used here so that "today" stays a date and the result
# is not mislabelled as "tomorrow".
now = datetime.datetime.today()
three_days_ago = now - datetime.timedelta(days=3)
print(three_days_ago)     # e.g. 2018-07-02 13:37:00.107703

# Elapsed time between a fixed start moment and now.
start = "2018-07-03 00:00:00"
time_now = datetime.datetime.now()
b = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
elapsed = time_now - b
# timedelta keeps whole days and the left-over seconds separately.
days = elapsed.days
minutes = elapsed.seconds / 60
# Total minutes = whole days converted to minutes + the left-over minutes.
all_minutes = days * 24 * 60 + minutes
print(minutes)      # e.g. 821.7666666666667
print(days)     # e.g. 2
print(all_minutes)      # e.g. 3701.7666666666664

base64编码与解码

import base64

# Encode text as base64: str -> UTF-8 bytes -> base64 bytes -> ASCII str.
content = "abc124我是"
contents_base64 = base64.b64encode(content.encode('utf-8','ignore')).decode("utf-8")

# Decode: b64decode returns bytes, so decode them as UTF-8 to get the
# original text back instead of a bytes object.
contents = base64.b64decode(contents_base64).decode("utf-8")

html转义字符

from html.parser import HTMLParser

# HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
# supported replacement.
import html

# htmls contains character references such as &lt;div&gt;&lt;p&gt;;
# unescape turns them back into the literal characters of the HTML tags.
txt = html.unescape(htmls)

url中的中文解码

  # Import the submodule explicitly: a bare "import urllib" does not
  # guarantee that urllib.parse is loaded.
  import urllib.parse

  # Percent-decode a URL (bytes are interpreted as UTF-8 by default).
  # NOTE: the sample escape %e8%85 stops in the middle of a multi-byte
  # character, so it decodes to the replacement character; a complete
  # sequence decodes to the original Chinese text.
  url = "www.baidu.com?wb =%e8%85"
  result_url = urllib.parse.unquote(url)

数据库操作

import pymysql

# Open a connection and a cursor.
# NOTE: host and credentials are hard-coded sample values; load them from
# configuration in real code.
conn = pymysql.connect(host='10.0.8.81', port=3306, user='root', passwd='root',db='xxx', charset='utf8')
cur = conn.cursor()

# %s marks a bound parameter; the driver quotes and escapes each value.
insert_sql = "insert into tbl_name(id,name,age) values(%s,%s,%s)"
id = 1
name = "like"
age = 26
data_list = []
data = (id,name,age)

# Insert a single row.
cur.execute(insert_sql,data)
conn.commit()

# Insert many rows with one call: pass a list of parameter tuples.
data_list.append(data)
cur.executemany(insert_sql,data_list)
conn.commit()

# Special characters (quotes, backslashes, ...) in name need no manual
# escaping: values passed as query parameters are escaped by the driver.
# Escaping them by hand as well would store the escape characters
# themselves. (PyMySQL 1.0+ also no longer exports pymysql.escape_string.)
data = (id,name,age)

# Update a row, binding both values as parameters instead of building the
# SQL text with string formatting.
# NOTE(review): content is not defined in this snippet; presumably it is
# the text produced by the crawler -- confirm.
update_sql = "update tbl_name set content = %s where id = %s"
cur.execute(update_sql, (content, id))
conn.commit()

# Batch update: queue parameter tuples and send them in one executemany.
# NOTE(review): contents, title, is_spider, one_new and update_data_list
# are not defined in this snippet; presumably they come from the
# surrounding crawl loop -- confirm.
update_sql = "UPDATE tbl_recieve SET content = %s ,title = %s , is_spider = %s WHERE id = %s"
update_data =  (contents,title,is_spider,one_new[0])
update_data_list.append(update_data)
# Flush once more than 500 rows are queued.
if len(update_data_list) > 500:
    try:
        cur.executemany(update_sql,update_data_list)
        conn.commit()
    except pymysql.MySQLError:
        # Undo the partially applied batch and surface the failure.
        conn.rollback()
        raise
    else:
        # Rows that were written must not be sent again on the next flush.
        update_data_list.clear()

目前想到的用到的就这么多,方便查阅~ 欢迎补充~~!

你可能感兴趣的:(爬虫之路(5):爬虫中的常用代码集锦)