Some things you need to know
Regular expressions:
20\d* matches "20" followed by any number of digits (e.g. years like 2019), not arbitrary numbers
http.*?\.html matches the pattern and returns the entire matched text (no capture group)
xxx/(.*?)\.xxx with a capture group, only the content inside (.*?) is returned
\d digit  \D non-digit  \s whitespace (space, tab, form feed)  \S non-whitespace  \w word character, including the underscore  \W non-word character  . any character  * any number of repetitions
pattern1=r'https?://www.xxx.com/a/(.*?)\.html' how a pattern is written
a=re.findall(pattern1,s3)
a=re.findall(pattern_to_match, string_to_search)
Example:
import re
s1="This is a work day"
s2="Todat is 2019=11=20 to 2009-11-25"
s3="https://www.xxx.com/a/rrrrrk1111.html sajhd asfef afwasf w http://www.xxx.com/a/dew23r234k11.html http://www.xxx.com/a/r221.html"
pattern1=r'https?://www.xxx.com/a/(.*?)\.html' # \. escapes the dot so it matches a literal period
pattern2=r'20\d*'
pattern3=r'http.*?\.html'
a=re.findall(pattern1,s3)
b=re.findall(pattern2,s2)
c=re.findall(pattern3,s3)
print(a)  # ['rrrrrk1111', 'dew23r234k11', 'r221']
print(b)  # ['2019', '20', '2009']
print(c)  # ['https://www.xxx.com/a/rrrrrk1111.html', 'http://www.xxx.com/a/dew23r234k11.html', 'http://www.xxx.com/a/r221.html']
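A quick sketch of the character classes listed above (the sample string is made up for illustration):
import re
s = "user_1 paid 42.50 on 2019-11-20"
print(re.findall(r'\d+', s))   # ['1', '42', '50', '2019', '11', '20'], every run of digits
print(re.findall(r'\w+', s))   # words: letters, digits and underscore, e.g. 'user_1'
print(re.findall(r'\s', s))    # the whitespace characters (here: four spaces)
print(re.findall(r'\D+', s))   # runs of non-digit characters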
MySQL: insert, delete, update and query (CRUD)
---------- SELECT ----------
select * from table_name; # query all rows in the table
select col1,col2 from table_name; # query specific columns
select distinct col... from table_name; # deduplicate results
select concat(col1,col2) from table_name; # concatenate columns in the result
select col... from table_name where condition; # conditional query
## comparison operators in conditions: ( equal: =  greater than: >  greater or equal: >=  less than: <  less or equal: <=  not equal: != or <> )
where col like '%0' # fuzzy match: value ends with 0
where col like '%0%' # value contains 0; _ matches any single character
---------- INSERT ----------
insert into table_name(col1,col2...) values(val1,val2...);
insert into table_name values(val1,val2...); # insert supplying every column of the table
---------- UPDATE ----------
update table_name set col=value where condition; # the condition limits which rows change, otherwise the whole table is updated
---------- DELETE ----------
delete from table_name where condition; # the condition limits which rows are deleted, otherwise all rows are removed
Example:
import pymysql
def data():
    # connect to the database and return a cursor
    conn = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='library')
    c = conn.cursor()
    return c
def chashu():
    num = input("1. search by title  2. search by ID\n")
    if num == "1":
        bookname = input("enter the book title: ")
        bsql = "select * from book where book_name like '%{}'".format(bookname)
    elif num == "2":
        bid = input("enter the id: ")
        b1 = input("enter the low price: ")
        b2 = input("enter the high price: ")
        # BETWEEN expects the low bound first, then the high bound
        bsql = "select * from book where book_id like '%{0}%' " \
               "and b_price between {1} and {2}".format(bid, b1, b2)
    else:
        return  # any other input: bsql would be undefined, so bail out
    c1 = data()
    c1.execute(bsql)
    m = c1.fetchall()
    for i in m:
        print(i)
if __name__ == '__main__':
    while True:
        aa = input("1. search  3. quit\n")
        if aa == "1":
            chashu()
        elif aa == "3":
            break
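The example above only reads data. Here is a minimal sketch of insert/update/delete through pymysql, assuming the same library database and that the book table has book_name and b_price columns (the column names are assumptions):
import pymysql
conn = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='library')
c = conn.cursor()
# %s placeholders let pymysql escape the values, avoiding the injection risk of format()
c.execute("insert into book(book_name, b_price) values(%s, %s)", ("test book", 19.9))
c.execute("update book set b_price=%s where book_name=%s", (9.9, "test book"))
c.execute("delete from book where book_name=%s", ("test book",))
conn.commit()  # insert/update/delete only take effect after commit()
conn.close()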
Loop examples:
x=["Hello","world","aaa"]
abc=lambda x:len(x)
for itme in x:
print(abc(itme))
# 输出第二个元素
abb1 = lambda x: x[1]
for aa1 in x:
print(abb1(aa1))
# 输出最后一个元素
abb=lambda x:x[-1]
for aa in x:
print(abb(aa))
# 输出“l”的个数
acb=lambda x:x.count("l")
for bb in x:
print(acb(bb))
tuple1=(1,2,3,"a","b","c")
a1=list(tuple1)      # tuples are immutable, so convert to a list first
a1.append("abc")
print(a1)            # [1, 2, 3, 'a', 'b', 'c', 'abc']
print(tuple(a1))     # (1, 2, 3, 'a', 'b', 'c', 'abc')
Multiprocessing example (a process pool, not threads):
from multiprocessing import Pool
import time
import datetime
import requests
import re
mis=["http://imga5.5054399.com/upload_pic/2019/10/25/4399_11081341008.jpg",
     "http://imga5.5054399.com/upload_pic/2019/7/1/4399_16544599503.jpg",
     "http://imga2.5054399.com/upload_pic/2019/10/24/4399_17424840649.jpg",
     "http://imga4.5054399.com/upload_pic/2019/11/7/4399_10042503678.jpg"]
def tupian(url):
    # pull the file name out of the url and download the image
    pattern = r"4399_(.*)\.jpg"
    b = re.findall(pattern, url)
    heads = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36"}
    res = requests.get(url, headers=heads)
    with open("D:/pytupian/{0}.jpg".format(b[0]), "wb") as f:  # the directory must already exist
        f.write(res.content)
m=[2,4,6]
def a(m):
    time.sleep(2)
    print(m*m)
if __name__ == '__main__':
    start = datetime.datetime.now()
    p = Pool(2)            # a pool of two worker processes
    s = p.map(a, m)        # apply a() to every element of m across the pool
    p.map(tupian, mis)     # download the four images in parallel
    p.close()              # no more tasks will be submitted
    p.join()               # wait for the workers to finish
    e = datetime.datetime.now()
    print(e - start)
1. Saving a web page locally
"w" writes text, "wb" writes binary
decode() turns a bytes object into a string using the encoding you pass; the default is 'utf-8'
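A two-line sketch of the encode/decode round trip (the sample string is illustrative):
b = "百度".encode("utf-8")  # str -> bytes: b'\xe7\x99\xbe\xe5\xba\xa6'
print(b.decode())            # bytes -> str, defaults to utf-8: 百度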
Faking a browser with request headers:
import requests
aa={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) "  # browser disguise
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/63.0.3239.26 Safari/537.36 "
    "Core/1.63.6788.400 QQBrowser/10.3.2816.400"}
baiduLink="https://www.baidu.com"  # site address
pic="https://www.baidu.com/img/bd_logo1.png"  # image address
res=requests.get(baiduLink,headers=aa)  # fetch the site
res.content  # the raw binary response body
with open("D:/python/q.html","w",encoding="utf-8") as f:
    f.write(res.content.decode("utf8"))  # decode the bytes before writing as text
res=requests.get(pic,headers=aa)
with open("D:/python/q.png","wb") as f:  # forward slashes avoid backslash-escape surprises on Windows
    f.write(res.content)
2. Enter a Tieba forum name and fetch its content
find returns only the first match
find_all returns every match
soup.select returns every node matching a CSS selector
Selector syntax:
Search by tag
print(soup.select('title'))
# [<title>The Dormouse's story</title>]
Search by class name
print(soup.select('.sister'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
Search by ID
print(soup.select('#link1'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
Combined tag query
print(soup.select("head > title"))
# [<title>The Dormouse's story</title>]
Search by an attribute inside a tag
print(soup.select('a[href="http://example.com/elsie"]'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
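The outputs above come from the "three sisters" sample document in the BeautifulSoup docs; a minimal setup so the selector lines can be run as written:
from bs4 import BeautifulSoup
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
</body></html>"""
soup = BeautifulSoup(html_doc, 'html.parser')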
Search example:
from bs4 import BeautifulSoup
import requests
aa={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/63.0.3239.26 Safari/537.36 "
    "Core/1.63.6788.400 QQBrowser/10.3.2816.400"}
tiebaName=input("enter a Tieba forum name\n")  # prompt, then store the input
for i in range(10):  # i takes the ten values 0-9; note it is never used in url1, so every pass fetches the same page
    url1="https://tieba.baidu.com/f?ie=utf-8&kw={0}&fr=search".format(tiebaName)
    res=requests.get(url1,headers=aa)
    filePath="D:/python/贴吧.txt"
    with open(filePath,"w",encoding="utf-8") as f:
        f.write(res.content.decode("utf-8"))
    with open("D:/python/贴吧.txt","r",encoding="utf-8") as f:
        soup = BeautifulSoup(f, 'html.parser')  # html.parser is the parser used
    a = soup.find('meta', {'name': 'description'})
    print(a['content'])  # read an attribute value
    b = soup.find('title')
    print(b.string)  # read the text content
    c = soup.find_all('meta')  # a list, so it can be indexed
    print(c[1]['content'])
    d = soup.select('meta[name="description"]')
    print(d)
    lista = soup.select('div#nav_menu > a')  # <a> tags that are direct children of the div with id nav_menu
    for item in lista:
        # listNav.append(item['href'])
        print(item['href'], item.string)  # print the href value and the tag's text
    listb = soup.find('ul', {'class': 'post_nav_block'})  # first <ul> whose class is post_nav_block
    print(listb)
    for item in listb:  # iterates the ul's children: Tags and NavigableStrings
        rul = item.find('a')
        if rul == -1 or rul is None:  # a NavigableString's find() is str.find (returns -1); a Tag without <a> returns None
            continue
        print(rul['href'], rul['title'], rul.string)  # the first <a>'s href value, title value and text