Web scraping means using a program to fetch the content we need from the web, such as text, videos, images, and other information.
Environment: Python 3.7, PyCharm
import urllib.request
import requests

url = "http://www.baidu.com"
# If the target pages follow a regular URL pattern, the URLs can be constructed in bulk, e.g.:
# urls = ['http://xa.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 5)]
print("第一种方法")
response1 = urllib.request.urlopen(url)
# Get the status code; 200 means success
print(response1.getcode())
# Get the length of the page content
print(len(response1.read()))
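getcode() returning anything other than 200 means the request did not succeed; a minimal sketch of catching outright failures with urllib.error (the timeout value is my own illustrative choice, not from the original):

import urllib.error

try:
    response = urllib.request.urlopen(url, timeout=10)
    print(response.getcode())  # 200 on success
except urllib.error.HTTPError as e:
    print("HTTP error:", e.code)   # server answered with an error status
except urllib.error.URLError as e:
    print("URL error:", e.reason)  # network-level failure (DNS, refused connection, ...)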
print("第二种方法")
request_1 = urllib.request.Request(url)
# Impersonate a Mozilla browser for the crawl
request_1.add_header("User-Agent", "Mozilla/5.0")
response2 = urllib.request.urlopen(request_1)
print(response2.getcode())
print(len(response2.read()))
print("第三种方法,使用第三方库requests")
def start_requests(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # Get the status code; 200 means success
    print(response.status_code)
    return response.content.decode()

html = start_requests(url)
print(len(html))
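The comment at the top showed how to construct a batch of page URLs; a minimal sketch of fetching them in a loop with requests (the xiaozhu.com pattern comes from that comment; the 2-second pause is an assumption added for politeness):

import time

urls = ['http://xa.xiaozhu.com/search-duanzufang-p{}-0/'.format(number)
        for number in range(1, 5)]
for page_url in urls:
    resp = requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    print(page_url, resp.status_code)
    time.sleep(2)  # pause between requests to avoid hammering the server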
1. Searching for tags with the soup.select() function
from bs4 import BeautifulSoup

def get_text(urls, data=None):
    web_data = requests.get(urls).content.decode()
    soup = BeautifulSoup(web_data, 'lxml')
    # Get the page title
    title = soup.select('title')
    # Get the body text: p tags with class article-intro (select() has no class_
    # argument, so the class filter belongs in the CSS selector itself)
    texts = soup.select('div > p.article-intro')
    # texts = soup.select('li')             # search li tags
    # texts = soup.select('div > ul > li')  # same result via an explicit path
    print(len(texts))
    for text in texts:
        data = {
            'text': text.get_text()  # get_text() also returns the text of child tags (commonly used)
        }
        print(data)

get_text(url)
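Because select() takes CSS selectors, tag nesting and class filters are written in the selector string itself. A self-contained sketch against an inline snippet (the HTML here is invented for illustration):

from bs4 import BeautifulSoup

snippet = """
<div>
  <ul><li>item 1</li><li>item 2</li></ul>
  <p class="article-intro">intro paragraph</p>
</div>
"""
demo = BeautifulSoup(snippet, 'lxml')
print(demo.select('li'))               # every li tag
print(demo.select('div > ul > li'))    # li tags reached via an explicit path
print(demo.select('p.article-intro'))  # p tags carrying class article-intro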
2. Retrieving tag content with soup.find_all()
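The snippets in this section and the next run against a soup built from an html_doc string the original does not show; the stand-in below is an assumption, chosen so its hrefs and classes match the lookups that follow (it mirrors the sample document from the BeautifulSoup documentation):

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body><p class="story">Once upon a time there were three little sisters; their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p></body></html>
"""
soup = BeautifulSoup(html_doc, "html.parser")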
links = soup.find_all('a')
print("All links")
for link in links:
    print(link.name, link['href'], link.get_text())
3. Parsing with soup.find() and regular-expression matching
Unlike find_all(), find() returns only the first matching tag (or None if nothing matches). Create a BeautifulSoup parser object:
import re

soup = BeautifulSoup(html_doc, "html.parser", from_encoding="utf-8")  # from_encoding only takes effect when html_doc is bytes
print ("获取特定的URL地址")
link_node = soup.find('a',href="http://example.com/elsie")
print (link_node.name,link_node['href'],link_node['class'],link_node.get_text())
print ("正则表达式匹配")
link_node = soup.find('a',href=re.compile(r"ti"))
print (link_node.name,link_node['href'],link_node['class'],link_node.get_text())
print ("获取P段落的文字")
p_node = soup.find('p',class_='story')
print (p_node.name,p_node['class'],p_node.get_text())
1. Saving to a txt file
f = open('content.txt', 'a+', encoding="utf-8")  # open the file in append mode
f.write(text.get_text())
f.close()
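More idiomatically, a with-block closes the file automatically, and the write can sit inside the get_text() loop from section 1 to save every extracted paragraph, one per line:

with open('content.txt', 'a+', encoding="utf-8") as f:
    for text in texts:
        f.write(text.get_text() + '\n')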
2. Saving to an Excel file
# --- partial code ---
import xlwt
import xlrd
from xlutils.copy import copy  # needed for copy(); missing from the original snippet

# Reopen an existing workbook; xlrd workbooks are read-only, so copy() makes a writable one
r_xls = xlrd.open_workbook('xian.xls')
excel = copy(r_xls)
table = excel.get_sheet(0)
# Or create a brand-new workbook and sheet
workbook = xlwt.Workbook(encoding='utf-8')
sheet1 = workbook.add_sheet('西安')
# Write one scraped record per row (row, title, address, url come from the scraping loop)
table.write(row, 0, row)
table.write(row, 1, title.string.strip())
table.write(row, 2, address.string.strip())
table.write(row, 3, url)
# Note: both saves target xian.xls, so the second save overwrites the first
excel.save('xian.xls')
workbook.save('xian.xls')
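The partial code above mixes two workbooks saving to the same file; a minimal end-to-end sketch of the intended create-then-append pattern (the file name, sheet name, and sample row values are assumptions for illustration):

import xlwt
import xlrd
from xlutils.copy import copy

# First run: create the workbook with a header row.
wb = xlwt.Workbook(encoding='utf-8')
sheet = wb.add_sheet('xian')
for col, name in enumerate(['row', 'title', 'address', 'url']):
    sheet.write(0, col, name)
wb.save('xian.xls')

# Later runs: reopen, copy (xlrd workbooks are read-only), append one row, save.
r_xls = xlrd.open_workbook('xian.xls')
next_row = r_xls.sheet_by_index(0).nrows  # index of the first empty row
w_xls = copy(r_xls)
table = w_xls.get_sheet(0)
table.write(next_row, 0, next_row)
table.write(next_row, 1, 'sample title')
table.write(next_row, 2, 'sample address')
table.write(next_row, 3, 'http://example.com')
w_xls.save('xian.xls')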