### requests basic functions
import requests

url = 'http://www.baidu.com'
response = requests.get(url)
print(response.status_code)        # 200
print(type(response))              # <class 'requests.models.Response'>
print(response.headers)            # {'Date': 'Mon, 21 Jan 2019 12:29:05 GMT', 'Connection': 'Keep-Alive', ...}
print(response.encoding)           # ISO-8859-1 -- guessed from the HTTP header; used as the default when the header has no charset
print(response.apparent_encoding)  # utf-8 -- fallback encoding guessed from the response content
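# When the header carries no charset, requests falls back to ISO-8859-1 and a
# Chinese page decodes garbled. A minimal sketch of one common fix, switching
# to the content-based guess in that case:
import requests

response = requests.get('http://www.baidu.com')
if response.encoding == 'ISO-8859-1':               # header carried no charset
    response.encoding = response.apparent_encoding  # trust the guess from the body
print(response.text[:100])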
### requests general framework
# General-purpose code framework
import requests

def getHtmlText(url):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # raises HTTPError if the status is not 200
        response.encoding = response.apparent_encoding
        return response.text
    except:
        return 'Exceptional error occurred'

if __name__ == '__main__':
    url = 'http://file.dl01.zxxk.com//OutFile/20190122/11265993164061984.doc?mkey=5e997e6187fcdc82217253aed14a5676705'
    print(getHtmlText(url))
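# The bare except above reports every failure the same way. A sketch of a
# variant (hypothetical name) that distinguishes the common requests exceptions:
import requests

def getHtmlTextVerbose(url):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.exceptions.Timeout:
        return 'request timed out'
    except requests.exceptions.HTTPError as e:
        return 'bad status: {}'.format(e)
    except requests.exceptions.RequestException as e:
        return 'request failed: {}'.format(e)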
### Basic framework for downloading an image
import requests, os

url = "http://image.ngchina.com.cn/2019/0122/20190122124507342.jpg"
root = "D://pics//"
path = root + url.split("/")[-1]    # file name = last segment of the URL
try:
    if not os.path.exists(root):    # create the folder if it does not exist yet
        os.mkdir(root)
    if not os.path.exists(path):    # download only if the file is not there yet
        r = requests.get(url)
        with open(path, 'wb') as f: # the with block closes the file automatically
            f.write(r.content)
        print('download success')
    else:
        print("file already exists")
except:
    print("download failed")
### Scraping the Chinese university ranking site with requests + bs4
# pip install beautifulsoup4
# from bs4 import BeautifulSoup               # import the BeautifulSoup class from bs4
# soup = BeautifulSoup("data", "html.parser") # bs4's HTML parser; lxml also provides HTML and XML parsers (pip install lxml)
# print(soup.prettify())
# <>.find_all(name, attrs, recursive, string)
# name: match tag names; attrs: match tag attributes; recursive: whether to search all descendants (default True); string: match the text between <>...</>
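# A small runnable demo of find_all (the HTML snippet is made up for illustration):
from bs4 import BeautifulSoup

html = '<div><p class="intro">hello</p><p>world</p></div>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.find_all('p'))                             # every <p> tag
print(soup.find_all('p', attrs={'class': 'intro'}))   # only <p class="intro">
print(soup.find_all(string='world'))                  # text nodes equal to 'world'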
import requests,bs4
from bs4 import BeautifulSoup
def getHtmlText(url):                    # fetch the page content
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()      # raises HTTPError if the status is not 200
        response.encoding = response.apparent_encoding
        return response.text
    except:
        return 'Exceptional error occurred'
def fillUnivList(html, uinfo):           # extract the ranking rows from the page
    soup = BeautifulSoup(html, 'html.parser')
    for tr in soup.find('tbody').children:     # locate the <tbody> tag, then walk its children
        if isinstance(tr, bs4.element.Tag):    # keep only Tag nodes; skip the bare strings between tags
            tds = tr('td')                     # shorthand for tr.find_all('td')
            uinfo.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])
def printUnivList(uinfo, num):           # print the results
    tplt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"   # pad the name column with chr(12288), the full-width space
    print(tplt.format("rank", "university", "location", "score", chr(12288)))
    for i in range(num):
        u = uinfo[i]
        print(tplt.format(u[0], u[1], u[2], u[3], chr(12288)))
def main():
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    uinfo = []
    html = getHtmlText(url)
    fillUnivList(html, uinfo)
    printUnivList(uinfo, 3)              # print the top N entries

if __name__ == '__main__':
    main()
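# Note tr('td') above: calling a Tag is shorthand for find_all. A tiny demo
# (the table row is made up for illustration):
from bs4 import BeautifulSoup

row = BeautifulSoup('<tr><td>1</td><td>Tsinghua University</td></tr>', 'html.parser').tr
print(row('td'))                         # same as row.find_all('td')
print(row('td') == row.find_all('td'))   # True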
# Regular expressions -- for describing string patterns
# regex = re.compile(pattern, flags=0); pattern is the expression as a string or raw string;
# flags are matching options, e.g. re.I (ignore case) and re.S (make . match newlines)
# Import style 1: import module_name
# Import style 2: from module_name import name1, name2
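# A short compile-and-search demo (the pattern and test string are invented
# for illustration, modeled on the Taobao fields used below):
import re

regex = re.compile(r'"view_price":"([\d.]*)"', re.I)  # raw-string pattern, case-insensitive
sample = '{"view_price":"128.00"}'
match = regex.search(sample)
if match:
    print(match.group(1))   # 128.00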
import requests, time
import random, re, json

def getHTMLText(url):
    cookies = dict(cookies_are='cooks')
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'}
    # A list rather than a dict: dict keys are unique, so a dict could hold only one
    # 'http' and one 'https' entry; a list lets several proxies be rotated at random.
    proxies = ["115.218.222.64:9000", "61.164.39.67:53281"]
    try:
        response = requests.get(url, timeout=50, cookies=cookies,
                                proxies={'http': random.choice(proxies)}, headers=headers)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        print(response.url)
        return response.text
    except:
        return "Exceptional error occurred"
def parsePage(infoList, html):
    try:
        price = re.findall(r'"view_price":"[\d.]*"', html)   # price
        title = re.findall(r'"raw_title":".*?"', html)       # title
        address = re.findall(r'"item_loc":".*?"', html)      # location
        payments = re.findall(r'"view_sales":".*?"', html)   # number of buyers
        ShopName = re.findall(r'"nick":".*?"', html)         # shop name
        PIC_img = re.findall(r'"pic_url":".*?"', html)       # image URL
        for i in range(len(price)):
            INfoDICT = {}
            INfoDICT['view_price'] = eval(price[i].split(":")[1])  # eval strips the surrounding quotes
            INfoDICT['raw_title'] = eval(title[i].split(":")[1])
            INfoDICT['item_loc'] = eval(address[i].split(":")[1])
            INfoDICT['view_sales'] = eval(payments[i].split(":")[1])
            INfoDICT['nick'] = eval(ShopName[i].split(":")[1])
            INfoDICT['pic_url'] = eval(PIC_img[i].split(":")[1])
            infoList.append(INfoDICT)
            # print(INfoDICT)
    except:
        print("except Error")
def PrintGoodList(infoList):
    tplt = "{:4}\t{:16}\t{:8}\t{:8}\t{:8}\t{:16}\t{:30}"
    print(tplt.format("no.", "title", "price", "location", "buyers", "shop", "image URL"))
    count = 0
    for g in infoList:
        count = count + 1
        print(tplt.format(count, g['raw_title'], g['view_price'], g['item_loc'],
                          g['view_sales'], g['nick'], g['pic_url']))
def Getmain(keyword):
    # keyword = input("Enter a keyword: ")
    depth = 2                                     # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + keyword
    infoList = []
    for i in range(depth):
        try:
            url = start_url + "&s=" + str(44 * i) # Taobao result pages step by 44 items
            # time.sleep(1)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    print(json.dumps(infoList, ensure_ascii=False))
    # PrintGoodList(infoList)

# Getmain('schoolbag')   # example call with a sample keyword
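# eval() executes whatever text the page happens to contain, so applying it to
# scraped data is risky. A safer sketch that strips the quotes by slicing or by
# decoding the value as JSON (sample string made up for illustration):
import json

raw = '"view_price":"128.00"'
value = raw.split(':', 1)[1]   # '"128.00"'
print(value.strip('"'))        # 128.00
print(json.loads(value))       # 128.00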