Regex, XPath, and bs4 Usage

Regex:
Single-character matching
. matches any character except a newline
\d matches a digit
\D matches a non-digit
\w matches a word character [a-zA-Z0-9_]
\W matches a non-word character
\s matches a whitespace character: space, \n, \t, ...
\S matches a non-whitespace character
^ matches at the start of the string
$ matches at the end of the string
[0-9] is equivalent to \d (matches 0-9)
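A quick illustration of these character classes (a minimal sketch; the sample string is made up):

import re

sample = 'Order 42, shipped 2019-07-30\tOK'
print(re.findall(r'\d', sample))      # every digit: ['4', '2', '2', '0', ...]
print(re.findall(r'\w+', sample))     # runs of word characters: ['Order', '42', 'shipped', ...]
print(re.findall(r'\s', sample))      # whitespace characters: the spaces and the tab
print(re.findall(r'^Order', sample))  # ^ anchors the match to the start of the string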
Multi-character matching (greedy)
* matches the preceding character zero or more times
+ matches the preceding character one or more times (at least once)
? matches the preceding character 0 or 1 times
{n,m} matches the preceding character n to m times
Multi-character matching (non-greedy)
*?
+?
??
Other
() grouping
| logical OR
\ escape character
Methods in the re module
re.compile(): build a regular expression object
re.match(): match from the start of the string; single match; returns a result as soon as one is found, otherwise returns None
re.search(): search the whole string; single match; returns a result as soon as one is found, otherwise returns None
re.findall(): find every part of the string that matches the pattern; returns a list
re.finditer(): find every part of the string that matches the pattern; returns an iterable object
re.sub(): replace substrings according to a regular expression
re.split(): split a string according to a regular expression
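A short sketch of the methods above on an invented sample string, including the difference between greedy .* and non-greedy .*?:

import re

text = '<b>first</b> and <b>second</b>'

pattern = re.compile(r'<b>(.*?)</b>')          # non-greedy: stops at the first </b>
print(pattern.findall(text))                   # ['first', 'second']
print(re.findall(r'<b>(.*)</b>', text))        # greedy: ['first</b> and <b>second']

print(re.match(r'<b>', text))                  # matches only at the start; returns a Match object
print(re.search(r'second', text).group())      # 'second', found anywhere in the string
for m in re.finditer(r'<b>(.*?)</b>', text):   # iterator of Match objects
    print(m.group(1))
print(re.sub(r'</?b>', '', text))              # 'first and second'
print(re.split(r'\s+and\s+', text))            # ['<b>first</b>', '<b>second</b>']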
Regex in practice
import re
from urllib import request


def get_rank_data(url='http://top.hengyan.com/dianji/default.aspx?p=1'):
    # Build the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    }

    # Parameters of request.urlopen():
    #   url: the target url
    #   data=None: None by default, meaning a GET request; if data is not None the request is a POST
    #   timeout: timeout for the request, in seconds
    #   cafile=None, capath=None, cadefault=False: certificate-related parameters
    #   context=None: can be used to skip certificate verification

    # urlopen() by itself cannot attach request headers:
    # response = request.urlopen(url=url, timeout=10)

    # Attach request headers via a Request object
    req = request.Request(url=url, headers=headers)
    response = request.urlopen(req, timeout=10)

    # Response status code
    code = response.status
    # The url that was actually requested
    url = response.url
    print(code, url)

    b_content = response.read()
    # bytes -> str: decode
    # str -> bytes: encode
    # print(b_content)
    html = b_content.decode('utf-8')
    # print(html)

    # File operations
    # Common open() modes:
    #   r / rb: read (text / binary)
    #   w / wb: write, truncating the file first (text / binary)
    #   a / ab: append (text / binary)
    #   adding '+' to any of them opens the file for both reading and writing
    with open('hengyan.html', 'w') as file:
        file.write(html)

    # Parse the data with regular expressions
    # re.S flag: lets . match newline characters as well
    # NOTE: the HTML tags inside the original patterns were lost when the post was
    # rendered; the <ul class="...">, <li> and <td> fragments below are placeholders
    # and must be replaced with the real markup of the ranking page.
    pattern = re.compile(r'<ul class="...">(.*?)</ul>', re.S)
    ul_str = re.findall(pattern, html)[0]

    pattern1 = re.compile(r'<li.*?>(.*?)</li>', re.S)
    li_strs = re.findall(pattern1, ul_str)[1:]

    for li_str in li_strs:
        # print(li_str)
        # One capture group per column of the ranking row (six fields in total)
        pattern = re.compile(
            r'<td.*?>(.*?)</td>'
            r'.*?<td.*?>(.*?)</td>'
            r'.*?<td.*?>(.*?)</td>'
            r'.*?<td.*?>(.*?)</td>'
            r'.*?<td.*?>(.*?)</td>'
            r'.*?<td.*?>(.*?)</td>',
            re.S
        )
        data = re.findall(pattern=pattern, string=li_str)[0]
        print(data)

    # Extract the next page
    if '下一页' in html:
        # There is still a next page
        # Placeholder pattern: the tag wrapping the current page number was lost in rendering
        pattern = re.compile(r'<span class="...">(.*?)</span>', re.S)
        current_page = int(re.findall(pattern, html)[0])
        next_page = current_page + 1
        # Build the url of the next page
        next_page_url = re.sub(r'\d+', str(next_page), url)
        print(next_page_url)
        get_rank_data(next_page_url)
    else:
        print('数据提取完毕')

if __name__ == '__main__':
    get_rank_data()

Author: 某某某的洛先生
Source: CSDN
Original post: https://blog.csdn.net/cc576795555/article/details/98338862
Copyright notice: this is an original article by the blogger; please include a link to the original when reposting.

XPath:
XPath (XML Path Language) is a language for finding information in XML documents; it can be used to traverse the elements and attributes of an XML document.
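Before the full spider below, a minimal sketch of how etree.HTML() and xpath() are used; the HTML snippet and its class name are made up for the example:

from lxml import etree

html = '''
<div class="threadlist_title">
  <a href="/p/111">post one</a>
  <a href="/p/222">post two</a>
</div>
'''

content = etree.HTML(html)  # parse the HTML string into an element tree
hrefs = content.xpath('//div[@class="threadlist_title"]/a/@href')
texts = content.xpath('//div[@class="threadlist_title"]/a/text()')
print(hrefs)   # ['/p/111', '/p/222']
print(texts)   # ['post one', 'post two']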
import requests
import re
import time
import urllib.parse
from lxml import etree

class MeiNv():
    def __init__(self):
        self.url = 'http://www.tieba.baidu.com/category/40076/?page='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'
        }

    # List page
    def loadpage(self, url):
        response = requests.get(url=url, headers=self.headers)
        html = response.content.decode('utf-8')

        with open('baidu.html', 'w') as f:
            f.write(html)

        # Convert the html string into an element tree (xml)
        content = etree.HTML(html)
        # print(content)
        url_list = content.xpath(
            '//div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/@href'
        )
        # print(url_list)
        for detail_url in url_list:
            full_url = 'http://tieba.baidu.com' + detail_url
            self.detail(full_url)

    # Detail page
    def detail(self, url):
        response = requests.get(url=url)
        html = response.content.decode('utf-8')
        content = etree.HTML(html)
        img_list = content.xpath(
            '//img[@class="BDE_Image"]/@src'
        )
        for img in img_list:
            self.download(img)

    # Download
    def download(self, url):
        response = requests.get(url=url)
        # Image bytes: no need to decode
        self.save(response.content, url)

    # Save
    def save(self, content, img_url):
        filename = 'tieba/' + img_url[-10:] + '.jpg'
        print('正在下载' + filename)
        with open(filename, 'wb') as f:
            f.write(content)

    def main(self):
        kw = input('请输入网址')
        start = int(input('输入起始页'))
        end = int(input('输入终止页'))
        for i in range(start, end + 1):
            # Build the url of each list page
            full_url = self.url + 'f?' + 'kw=' + kw + '&' + 'pn=' + str((i - 1) * 50)
            self.loadpage(full_url)


if __name__ == '__main__':
    mn = MeiNv()
    mn.main()

bs4:
Like lxml, Beautiful Soup is an HTML/XML parser, and its main job is likewise to parse and extract HTML/XML data.
lxml only traverses the document locally, while Beautiful Soup is based on the HTML DOM: it loads the whole document and parses the entire DOM tree, so the time and memory overhead are much higher and its performance is lower than lxml's.
BeautifulSoup is quite simple to use for parsing HTML and has a very friendly API; it supports CSS selectors, the HTML parser from the Python standard library, and lxml's XML parser.
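A minimal sketch of the select() CSS-selector calls used in the spider below; the HTML fragment is invented, using the same class names the spider expects:

from bs4 import BeautifulSoup

html = '''
<ul class="all-img-list cf">
  <li><div class="book-mid-info"><h4><a href="/book/1">Book One</a></h4>
      <p><a>Author A</a><a>Fantasy</a></p></div></li>
</ul>
'''

bs = BeautifulSoup(html, 'lxml')                        # parse with the lxml parser
for li in bs.select('ul[class="all-img-list cf"] li'):  # CSS attribute selector, exact class value
    title = li.select('div[class="book-mid-info"] h4 a')[0].get_text()
    author = li.select('div[class="book-mid-info"] p a')[0].get_text()
    print(title, author)                                # Book One Author A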
import requests
from bs4 import BeautifulSoup
import urllib.parse
import jsonpath
import json
import re
class QiDianSpider():
    def __init__(self):
        self.url = 'https://www.address.com/all?page=1'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'
        }

    def loadpage(self, url):
        response = requests.get(url=url, headers=self.headers)
        bs = BeautifulSoup(response.text, 'lxml')
        li_list = bs.select('ul[class="all-img-list cf"] li')
        for li in li_list:
            title = li.select('div[class="book-mid-info"] h4 a')[0].get_text()
            href = urllib.parse.urljoin(response.url, li.select('div[class="book-mid-info"] h4 a')[0].get('href'))
            author = li.select('div[class="book-mid-info"] p a')[0].get_text()
            type1 = li.select('div[class="book-mid-info"] p a')[1].get_text()
            type2 = li.select('div[class="book-mid-info"] p a')[2].get_text()
            id = re.search(r'(\d+)', href).group(1)
            print(id)

            dict = {
                'title': title,
                'author': author,
                'type': type1 + '.' + type2,
                'href': href,
                'id': id
            }
            # print(dict)
            self.loaddetail(id, dict)

    def loaddetail(self, bookid, dict):
        response = requests.get(url='https://book.qidian.com/ajax/book/category?_csrfToken=asYDuKBW3fwHjeBdQNcX1GFeE2B9KcEe6dJyt&bookId=' + bookid, headers=self.headers)
        html = response.content.decode('utf-8')

        # Pull the volume list ('vs') and total chapter count ('cnt') out of the JSON response
        data = json.loads(html)
        vs = jsonpath.jsonpath(data, '$..vs')
        count = sum(jsonpath.jsonpath(data, '$..cnt'))
        dict['vs'] = vs[0]
        # Append one JSON record per book
        with open('qidian.html', 'a') as f:
            f.write(json.dumps(dict, ensure_ascii=False) + '\n')

    def start(self):
        self.loadpage(self.url)

if __name__ == '__main__':
    qds = QiDianSpider()
    qds.start()
