Python Web Scraping Final Exam Review

Some things you need to know
Regular expressions:
20\d*            matches "20" followed by any number of digits
http.*?\.html    matches the URL; with no capture group, the entire match is returned
xxx/(.*?)\.xxx   with a capture group, only the content inside (.*?) is returned
\d digit   \D non-digit   \s whitespace (space, tab, form feed)   \S non-whitespace   \w word character, including the underscore   \W non-word character   . any character   * any number of the preceding item
pattern1=r'https*://www.xxx.com/a/(.*?)\.html'   # how a pattern is written
a=re.findall(pattern1,s3)
a=re.findall(pattern_to_match, text_to_search)
Example:

import re
s1="This is a work day"
s2="Todat is 2019=11=20 to 2009-11-25"
s3="https://www.xxx.com/a/rrrrrk1111.html sajhd asfef afwasf w http://www.xxx.com/a/dew23r234k11.html http://www.xxx.com/a/r221.html"
pattern1=r'https*://www.xxx.com/a/(.*?)\.html'  # \. escapes the dot so it matches a literal "."
pattern2=r'20\d*'
pattern3=r'http.*?\.html'
a=re.findall(pattern1,s3)
b=re.findall(pattern2,s2)
c=re.findall(pattern3,s3)
print(a)   # ['rrrrrk1111', 'dew23r234k11', 'r221']
print(b)   # ['2019', '20', '2009']
print(c)   # ['https://www.xxx.com/a/rrrrrk1111.html', 'http://www.xxx.com/a/dew23r234k11.html', 'http://www.xxx.com/a/r221.html']
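The character classes listed above are not exercised by this example; a quick sketch (the sample string is made up for illustration):

import re

s = "room_42 is on floor 3"
print(re.findall(r'\d+', s))   # ['42', '3']                 runs of digits
print(re.findall(r'\w+', s))   # ['room_42', 'is', 'on', 'floor', '3']
print(re.findall(r'\D+', s))   # ['room_', ' is on floor ']  runs of non-digits
print(re.findall(r'\s', s))    # four single spaces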

Database CRUD (MySQL)

---------- SELECT ----------
select * from table_name;                 # query everything in the table
select col1,col2 from table_name;         # query the specified columns
select distinct col… from table_name;     # deduplicate rows
select concat(col1,col2) from table_name; # concatenate query results
select col… from table_name where condition;   # conditional query
## comparison operators in conditions: ( equal: =  greater: >  greater or equal: >=  less: <  less or equal: <=  not equal: != or <> )
where col like '%0'      # fuzzy match: value ends with 0
where col like '%0%'     # value contains 0;  _ matches any single character
---------- INSERT ----------
insert into table_name(field1,field2…) values(value1,value2…);
insert into table_name values(value1,value2…);   # insert into every field of the table
---------- UPDATE ----------
update table_name set field=value where condition;   # only matching rows are changed; without WHERE the whole table is modified
---------- DELETE ----------
delete from table_name where condition;   # only matching rows are deleted; without WHERE all rows are deleted

Example:

import pymysql
def data():
    conn = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='library')
    # connect to the database and return a cursor
    c=conn.cursor()
    return c
def chashu():
    num=input("1. search by title  2. search by ID")
    if num=="1":
        bookname = input("Enter book title")
        bsql = "select * from book where book_name like '%{}'".format(bookname)
    elif num=="2":
        bid=input("Enter id")
        b1=input("Enter minimum price")
        b2=input("Enter maximum price")
        # BETWEEN expects the lower bound first, so b1 must be the smaller value
        bsql = "select * from books where book_id like '%{0}%' " \
               "and b_price between {1} and {2}".format(bid, b1, b2)
    else:
        print("Invalid choice")
        return
    # note: formatting user input straight into SQL is open to injection;
    # real code should pass parameters to cursor.execute() instead
    c1=data()
    c1.execute(bsql)
    m=c1.fetchall()
    for i in m:
        print(i)
if __name__ == '__main__':
    while True:
        aa=input("1. search  3. quit")
        if aa=="1":
            chashu()
        elif aa=="3":
            break
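The example above only exercises SELECT. A minimal sketch of INSERT, UPDATE and DELETE through pymysql, assuming the book table has book_name and b_price columns; unlike queries, writes only take effect after commit():

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='library')
c = conn.cursor()
# parameterized values (%s) are safer than string formatting
c.execute("insert into book(book_name, b_price) values (%s, %s)", ("Python Basics", 39.9))
c.execute("update book set b_price=%s where book_name=%s", (29.9, "Python Basics"))
c.execute("delete from book where book_name=%s", ("Python Basics",))
conn.commit()   # persist the changes
conn.close()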

Loop examples:

x=["Hello","world","aaa"]
abc=lambda x:len(x)
for itme in x:
    print(abc(itme))
# 输出第二个元素
abb1 = lambda x: x[1]
for aa1 in x:
        print(abb1(aa1))
# 输出最后一个元素
abb=lambda x:x[-1]
for aa in x:
    print(abb(aa))
# 输出“l”的个数
acb=lambda x:x.count("l")
for bb in x:
    print(acb(bb))
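Lambdas are also handy as key functions, which the same list can illustrate:

x = ["Hello", "world", "aaa"]
print(sorted(x, key=lambda s: len(s)))    # ['aaa', 'Hello', 'world']
print(list(map(lambda s: s.upper(), x)))  # ['HELLO', 'WORLD', 'AAA']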



tuple1=(1,2,3,"a","b","c")
a1=list(tuple1)       # tuples are immutable, so convert to a list to modify
a1.append("abc")
print(a1)          #[1, 2, 3, 'a', 'b', 'c', 'abc']
print(tuple(a1))   #(1, 2, 3, 'a', 'b', 'c', 'abc')

Multiprocessing (process pool) example:

from multiprocessing import Pool
import time
import datetime
import requests
import re

mis=["http://imga5.5054399.com/upload_pic/2019/10/25/4399_11081341008.jpg",
    "http://imga5.5054399.com/upload_pic/2019/7/1/4399_16544599503.jpg",
    "http://imga2.5054399.com/upload_pic/2019/10/24/4399_17424840649.jpg",
    "http://imga4.5054399.com/upload_pic/2019/11/7/4399_10042503678.jpg"]
def tupian(mis):
    pattern=r"4399_(.*)\.jpg"
    b=re.findall(pattern,mis)    # extract the file name from the URL
    heads = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36"}
    res= requests.get(mis, headers=heads)
    with open("D:/pytupian/{0}.jpg".format(b[0]),"wb") as f:
        f.write(res.content)

m=[2,4,6]
def a(m):
    time.sleep(2)
    print(m*m)    # square each number after a 2-second delay

if __name__ == '__main__':    # required on Windows: child processes re-import this module
    start = datetime.datetime.now()
    p = Pool(2)               # pool of 2 worker processes
    s = p.map(a, m)
    p.map(tupian, mis)
    p.close()
    p.join()
    e = datetime.datetime.now()
    print(e-start)
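Because the image downloads are I/O-bound, a thread pool is a common alternative to a process pool; a minimal sketch with the standard library's concurrent.futures, reusing tupian and mis from above:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    # threads share memory and skip process start-up cost, fine for I/O-bound work
    with ThreadPoolExecutor(max_workers=2) as ex:
        ex.map(tupian, mis)   # the with-block waits for all downloads to finish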

1. Saving a web page locally
"w" writes text; "wb" writes binary
The decode() method decodes bytes into a string using the encoding you pass it (UTF-8 by default).
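A quick round-trip between bytes and str:

data = "百度".encode("utf-8")   # str -> bytes
print(data)                     # b'\xe7\x99\xbe\xe5\xba\xa6'
print(data.decode("utf-8"))     # bytes -> str: 百度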
Faking the browser headers:

import requests
aa={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) "   # browser disguise
                 "AppleWebKit/537.36 (KHTML, like Gecko) "
                 "Chrome/63.0.3239.26 Safari/537.36 "
                 "Core/1.63.6788.400 QQBrowser/10.3.2816.400"}
baiduLink="https://www.baidu.com"  # page address
pic="https://www.baidu.com/img/bd_logo1.png"   # image address

res =requests.get(baiduLink,headers=aa)  # fetch the page
# res.content holds the response body as bytes
with open("D:/python/q.html","w",encoding="utf-8")as f:
    f.write(res.content.decode("utf8"))        # decode the bytes before writing as text
res =requests.get(pic,headers=aa)
with open("D:/python/q.png","wb")as f:
    f.write(res.content)    # write the image bytes directly

2. Enter a tieba (forum) name and fetch its content
find returns only the first match
find_all returns every match
soup.select returns every match for a CSS selector
Selector formats:

Search by tag
print(soup.select('title'))
# [<title>The Dormouse's story</title>]
Search by class name
print(soup.select('.sister'))
# list of every tag whose class is "sister"
Search by id
print(soup.select('#link1'))
# list holding the tag whose id is "link1"
Combined tag query
print(soup.select("head > title"))
# [<title>The Dormouse's story</title>]
Search by attribute inside a tag
print(soup.select('a[href="http://example.com/elsie"]'))
# list of <a> tags with exactly that href
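These snippets are apparently based on the "three sisters" page from the BeautifulSoup documentation; a self-contained sketch with a cut-down version of that document, so the selectors can be run as-is:

from bs4 import BeautifulSoup

html = """<html><head><title>The Dormouse's story</title></head>
<body>
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
</body></html>"""

soup = BeautifulSoup(html, 'html.parser')
print(soup.select('title'))         # [<title>The Dormouse's story</title>]
print(soup.select('.sister'))       # both <a class="sister"> tags
print(soup.select('#link1'))        # [<a ... id="link1">Elsie</a>]
print(soup.select("head > title"))  # [<title>The Dormouse's story</title>]
print(soup.select('a[href="http://example.com/elsie"]'))  # just the Elsie tag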

Search example:

from bs4 import BeautifulSoup
import requests
aa={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) "
                 "AppleWebKit/537.36 (KHTML, like Gecko) "
                 "Chrome/63.0.3239.26 Safari/537.36 "
                 "Core/1.63.6788.400 QQBrowser/10.3.2816.400"}
tiebaName=input("Enter a tieba name\n")   # prompt, then store the input
for i in range(10):   # i takes the ten values 0-9 (note: i is unused in the URL, so every iteration fetches and overwrites the same page)
    url1="https://tieba.baidu.com/f?ie=utf-8&kw={0}&fr=search".format(tiebaName)
    res=requests.get(url1,headers=aa)
    filePath="D:/python/贴吧.txt"
    with open(filePath,"w",encoding="utf-8")as f:
        f.write(res.content.decode("utf-8"))
with open("D:/python/贴吧.txt","r",encoding="utf-8")as f:
    soup = BeautifulSoup(f, 'html.parser')    # html.parser is the built-in parser
    a = soup.find('meta', {'name': 'description'})
    print(a['content'])  # read an attribute value
    b = soup.find('title')
    print(b.string)  # read the text content
    c=soup.find_all('meta') # returns a list, so results can be picked by index
    print(c[1]['content'])
    d=soup.select('meta[name="description"]')
    print(d)

lista =soup.select('div#nav_menu > a')     # <a> tags that are direct children of the div whose id is "nav_menu"
for item in lista:
    # listNav.append(item['href'])
    print(item['href'],item.string)   # print the href value and the tag's text

listb = soup.find('ul',{'class':'post_nav_block'})      # first <ul> whose class is "post_nav_block"
print(listb)
for item in listb:
    rul=item.find('a')     # plain-text children use str.find, which returns -1 when 'a' is absent, so skip them
    if rul==-1:
        continue
    print(rul['href'],rul['title'],rul.string)   # print the href value, the title value, and the text
