[Example] Python bs4 BeautifulSoup + urllib.request: extracting URLs

>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> page=requests.get(url).text
>>> pagesoup=BeautifulSoup(page,'lxml')
>>> for link  in pagesoup.find_all(name='a',attrs={"href":re.compile(r'^http:')}):
...     print(link.get('href'))
...
http://www.szu.edu.cn
http://news.szu.edu.cn
http://210.39.3.155:9090/goLogin.do
http://www.szu.edu.cn/yxjg/xyxb.htm
http://www.szu.edu.cn/yxjg/znbm.htm
http://www.miibeian.gov.cn

>>>
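The title mentions urllib.request, but the session above uses requests. For reference, a minimal sketch of the same link extraction done with urllib.request instead (decoding the response bytes as GBK is an assumption here, based on the encoding this page is given further down in the post):

import re
import urllib.request
from bs4 import BeautifulSoup

url = 'http://www1.szu.edu.cn/board/'
# urlopen() returns bytes, so decode explicitly; GBK is assumed because the
# same page is decoded as GBK later in this post.
with urllib.request.urlopen(url) as resp:
    page = resp.read().decode('GBK', errors='replace')

pagesoup = BeautifulSoup(page, 'lxml')
for link in pagesoup.find_all(name='a', attrs={"href": re.compile(r'^http:')}):
    print(link.get('href'))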

--------------------------------

>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")

>>> print(soup.get_text())
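get_text() also returns the contents of <script> and <style> tags. If only the visible page text is wanted, one option (a sketch reusing the soup object above) is to drop those tags first:

# Remove script/style tags so get_text() only returns visible text.
for tag in soup(['script', 'style']):
    tag.decompose()
print(soup.get_text(strip=True))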


---------------------

>>> # -*- coding: utf-8 -*-
...
>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")

>>> print(soup.get_text())
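To inspect the extracted text offline (or to compare encodings), it can be saved to a text file; a small sketch, where the filename szu_board.txt is only an example:

# Save the plain text for later inspection; the filename is arbitrary.
with open('szu_board.txt', 'w', encoding='utf-8') as f:
    f.write(soup.get_text())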

-----------------------------------

>>> webdata = requests.get(url)
>>> webdata.encoding = 'GBK'

>>> print(webdata.text)
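Hard-coding 'GBK' works for this page, but requests can also guess the charset from the response body. A sketch using apparent_encoding instead of a fixed value:

webdata = requests.get(url)
# Let requests guess the encoding from the body instead of hard-coding GBK.
webdata.encoding = webdata.apparent_encoding
print(webdata.encoding)       # the guessed charset, e.g. a GB* encoding for this page
print(webdata.text[:200])     # first 200 characters decoded with the guessed charset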

----------------------------


>>> print(html)

>>> print(html.text)

>>> html_doc = html.text

>>> print(html_doc)

--------------------------------------------

>>> soup = BeautifulSoup(html_doc, "html.parser")
>>> didi = soup.b.next_element.strip()
Traceback (most recent call last):
  File "", line 1, in
TypeError: 'NoneType' object is not callable
>>> didi = soup.tr.next_element.strip()
>>> print(didi)


>>> didi = soup.br.next_element.strip()

>>> print(didi)
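next_element walks the parse tree in document order, so it often lands on a whitespace text node and strip() then yields an empty string. A small self-contained demo on a toy document shows what it returns:

from bs4 import BeautifulSoup

demo = BeautifulSoup('<table><tr><th>标题</th><td>内容</td></tr></table>', 'html.parser')
print(repr(demo.tr.next_element))        # the <th> Tag: the next node after <tr> in document order
print(demo.tr.next_element.get_text())   # 标题
print(repr(demo.th.next_element))        # the NavigableString '标题' inside <th>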

----------------------------------------------

>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> html.encoding='GBK'
>>> html = html.text
>>> bs_obj = BeautifulSoup(html, "lxml")
>>> bs_a_tag = bs_obj.find_all('a')
>>> print(bs_a_tag[0].text)
深大官网
>>>
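Building on bs_a_tag[0].text, the link text and the href can be collected together; a minimal sketch reusing the bs_obj built above:

# Pair each link's text with its href; links without an href print None.
for a in bs_obj.find_all('a'):
    print(a.get_text(strip=True), '->', a.get('href'))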

----------------------------------------------

>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> html.encoding='GBK'
>>> html = html.text
>>> bs_obj = BeautifulSoup(html, "lxml")
>>> bs_a_tag = bs_obj.find_all('a')
>>> print(bs_a_tag[0].text)
深大官网
>>> res_tr = r'<tr>(.*?)</tr>'
>>> m_tr = re.findall(res_tr,html,re.S|re.M)
>>> for line in m_tr:
...   print(line)
...   res_th = r'<th>(.*?)</th>'
...   m_th = re.findall(res_th,line,re.S|re.M)
...   for mm in m_th:
...     print(mm)    # Python 3: str is already Unicode, unicode() no longer exists
...   res_td = r'<td>(.*?)</td>'
...   m_td = re.findall(res_td,line,re.S|re.M)
...   for nn in m_td:
...     print(nn)
...
深大官网 English
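The same tr/th/td walk can also be done with BeautifulSoup instead of regular expressions, which copes better with attributes and nesting; a sketch reusing the bs_obj from above:

# BeautifulSoup equivalent of the tr/th/td regex extraction above.
for tr in bs_obj.find_all('tr'):
    cells = tr.find_all(['th', 'td'])
    print([c.get_text(strip=True) for c in cells])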



-------------- Below: the extracted hyperlink text -----------------------------

>>> res = r'<a .*?>(.*?)</a>'
>>> mm = re.findall(res,html,re.S|re.M)
>>> for value in mm:
...   print(value)
...
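Many of the hrefs on this board page are relative paths; to turn them into full URLs before following them, they can be resolved against the page URL with urllib.parse.urljoin. A small sketch:

from urllib.parse import urljoin

# Resolve relative hrefs against the page URL so every link is absolute.
for a in bs_obj.find_all('a', href=True):
    print(urljoin(url, a['href']))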


-------------------

A passing thought on crawl depth. Reference article: http://blog.csdn.net/u012063507/article/details/72831751







---------------- Above: extracting information from the td / tr / th tags --------------------

>>> import re
>>> from bs4 import BeautifulSoup
>>> import urllib.request
>>> import lxml
>>> import requests
>>> url = 'http://www1.szu.edu.cn/board/'
>>> html = requests.get(url)
>>> soup = BeautifulSoup(html.text,"lxml")
>>> html.encoding='GBK'
>>> html = html.text
>>> bs_obj = BeautifulSoup(html, "lxml")
>>> bs_a_tag = bs_obj.find_all('a')
>>> print(bs_a_tag[0].text)
深大官网
>>> res_tr = r'<tr>(.*?)</tr>'