from bs4 import BeautifulSoup
import urllib.request
# Fetch the W3School SQL tutorial index page and save a prettified copy.
print('Hello world')

# Site root; `follower` is the path of the tutorial index page.
header = 'http://www.w3school.com.cn'
follower = '/sql/index.asp'
url = header + follower
# Last page of the tutorial (sentinel for an earlier while-loop draft).
end = 'http://www.w3school.com.cn/sql/sql_summary.asp'
title = '教程'
# while url != end:
print(url)

response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'lxml')

# Use a context manager so the handle is closed even on error, and write
# UTF-8 explicitly: prettify() returns a str, and relying on the platform
# default encoding can raise UnicodeEncodeError for the Chinese content.
with open(title + '.html', 'w', encoding='utf-8') as fileHandle:
    fileHandle.write(soup.prettify())
#print(soup.prettify())
# Follow every SQL-tutorial link found on the index page and save each
# target page verbatim (raw bytes, so the original encoding is preserved).
for link in soup.find_all('a'):
    follower = link.get('href')
    # Tag.get() returns None when the attribute is missing, so guard
    # before calling .startswith() on it.
    if follower and follower.startswith('/sql'):
        print(follower)
        title = link.get('title')
        if title is None:
            # Anchor without a title attribute: derive a file name from
            # the URL path instead of crashing on None + '.html'.
            title = follower.rsplit('/', 1)[-1]
        url = header + follower
        response = urllib.request.urlopen(url)
        html = response.read()
        # Context manager guarantees the file is closed per iteration.
        with open(title + '.html', 'wb') as fileHandle:
            fileHandle.write(html)
#提取下一页的后缀,更新url,更新title
#fileHandle=open('sql_update.html','w')
#fileHandle.write(soup.prettify())
#fileHandle.close()
#html=html.decode('UTF-8')
#print(html)
#html=html.decode('UTF-8')
#print(soup.prettify())
'''print(soup.title)
for x in soup.find_all('a'):
if x.get('title')!= None:
print (x.get('title'))
'''
#if x.get('class')==
#print(soup.get_text())
#print(response)
#print(html)
自己写来抓所有的SQL相关内容的东西,用了beautifulSoup,感觉不错。确实可以开始学Python了,好玩=。=
应该再把html文件前面一段没什么用的东西截掉。然后是想办法把经过JS处理的html网页再扒下来,现在扒下来的都是原始的html网页只是刚好够用。
from bs4 import BeautifulSoup
import urllib.request
# Second pass: fetch the SQL tutorial index page and build a parse tree.
print('Hello world')

# Site constants.
header = 'http://www.w3school.com.cn'
follower = '/sql/index.asp'
end = 'http://www.w3school.com.cn/sql/sql_summary.asp'
title = '教程'

url = header + follower
print(url)

# Download the index page and hand the raw bytes to BeautifulSoup.
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'lxml')
#print(soup.prettify())
'''for link in soup.find_all('div'):
if link.get('id') =='maincontent':
print (link.prettify())
str=link.prettify('gbk')
fileHandle=open((title+'.html'),'wb')
fileHandle.write(str)
fileHandle.close()
'''
# Re-crawl every SQL tutorial page, but save only the <div id="maincontent">
# element of each page, re-encoded back to the site's original gbk encoding.
for link in soup.find_all('a'):
    follower = link.get('href')
    # Tag.get() returns None for anchors without an href; guard before
    # calling .startswith() on it.
    if follower and follower.startswith('/sql'):
        print(follower)
        title = link.get('title')
        if title is None:
            # No title attribute: fall back to the URL path segment so the
            # file-name concatenation below cannot fail on None.
            title = follower.rsplit('/', 1)[-1]
        url = header + follower
        response = urllib.request.urlopen(url)
        html = response.read()
        tempSoup = BeautifulSoup(html, 'lxml')
        for tempLink in tempSoup.find_all('div'):
            if tempLink.get('id') == 'maincontent':
                # prettify('gbk') returns bytes re-encoded to the page's
                # original charset — this was the tricky part.
                # (Renamed from `str`, which shadowed the builtin.)
                content = tempLink.prettify('gbk')
                with open(title + '.html', 'wb') as fileHandle:
                    fileHandle.write(content)
用prettify改成原来的编码方式就好了。