A Python crawler that fetches the SQL tutorial pages from w3school

from bs4 import BeautifulSoup
import urllib.request

header = 'http://www.w3school.com.cn'
follower = '/sql/index.asp'          # path of the tutorial index page
url = header + follower
end = 'http://www.w3school.com.cn/sql/sql_summary.asp'
title = '教程'                        # filename ("tutorial") for the index page

# while url != end:  # original plan: follow next-page links until the summary page
print(url)
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'lxml')

# Save the index page. Note: prettify() returns Unicode, and writing it out
# like this caused the mojibake discussed below; the second version fixes it.
fileHandle = open(title + '.html', 'w')
fileHandle.write(soup.prettify())
fileHandle.close()
# Walk every link on the index page and save each /sql page it points to.
for link in soup.find_all('a'):
    follower = link.get('href')
    if follower and follower.startswith('/sql'):   # guard: some <a> tags have no href
        print(follower)
        # Fall back to the last path segment when the link has no title attribute.
        title = link.get('title') or follower.rsplit('/', 1)[-1]
        url = header + follower
        response = urllib.request.urlopen(url)
        html = response.read()
        fileHandle = open(title + '.html', 'wb')   # raw bytes keep the original encoding
        fileHandle.write(html)
        fileHandle.close()

    # TODO: extract the next page's suffix, then update url and title
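
A minimal sketch of what that TODO and the commented-out while loop were aiming at: follow a next-page link from page to page until the summary page. It assumes each tutorial page has an <a> whose text contains '下一节' (next section); that is a guess about the page markup, not something verified against the site.

import urllib.request
from bs4 import BeautifulSoup

header = 'http://www.w3school.com.cn'
end = 'http://www.w3school.com.cn/sql/sql_summary.asp'
url = header + '/sql/index.asp'
while url != end:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
    # Hypothetical selector: an anchor whose text mentions '下一节'.
    nxt = soup.find('a', string=lambda s: s and '下一节' in s)
    if nxt is None or not nxt.get('href'):
        break  # no next link found; stop rather than loop forever
    url = header + nxt['href']
    print(url)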

I wrote this myself to grab all the SQL-related content. I used BeautifulSoup and it feels good to work with. Python really does seem worth learning; this is fun. =。=

Next I should trim the useless chunk at the top of each saved HTML file. After that, the goal is to find a way to pull down the pages as they look after JavaScript has run; what I'm grabbing now is just the raw HTML, which happens to be good enough for this site. A sketch of the JS-rendering idea follows.
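One way to get the JS-rendered page would be to drive a real browser with Selenium. This is a minimal sketch, not part of the original script: it assumes Selenium and a matching browser driver (e.g. geckodriver for Firefox) are installed.

# Hypothetical: fetch a page after JavaScript has run, via Selenium.
# Requires `pip install selenium` plus a browser driver on PATH.
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('http://www.w3school.com.cn/sql/index.asp')
rendered_html = driver.page_source   # HTML after scripts executed
driver.quit()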


from bs4 import BeautifulSoup
import urllib.request

header = 'http://www.w3school.com.cn'
follower = '/sql/index.asp'
url = header + follower
end = 'http://www.w3school.com.cn/sql/sql_summary.asp'
title = '教程'  # default filename (overwritten in the loop below)

print(url)
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'lxml')
for link in soup.find_all('a'):
    follower = link.get('href')
    if follower and follower.startswith('/sql'):   # guard: some <a> tags have no href
        print(follower)
        title = link.get('title') or follower.rsplit('/', 1)[-1]
        url = header + follower
        response = urllib.request.urlopen(url)
        html = response.read()
        tempSoup = BeautifulSoup(html, 'lxml')
        # Keep only the main article body of each page.
        for tempLink in tempSoup.find_all('div'):
            if tempLink.get('id') == 'maincontent':
                # The gotcha: prettify('gbk') re-encodes the Unicode tree
                # back to the site's original encoding. Also avoid shadowing
                # the built-in name `str` here.
                content = tempLink.prettify('gbk')
                fileHandle = open(title + '.html', 'wb')
                fileHandle.write(content)
                fileHandle.close()

I revised the code so it now grabs the main content block of every page. The tricky part was the character encoding: BeautifulSoup automatically parses HTML into Unicode, so if you write that straight to a file and open it in a browser, you get mojibake.

Re-encoding back to the site's original encoding with prettify('gbk') fixed it.
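
To make the encoding behaviour concrete, here is a small illustration. It is not part of the crawler; the filenames are made up, and it assumes a soup parsed from a gbk/gb2312 page as above.

# prettify() with no argument returns a Unicode str; passing an encoding
# returns bytes re-encoded in that charset.
text = soup.prettify()        # str (Unicode)
data = soup.prettify('gbk')   # bytes, back in the site's original encoding

fileHandle = open('demo.html', 'wb')
fileHandle.write(data)        # what the fixed script does
fileHandle.close()

# An equivalent fix: write the Unicode text with an explicit encoding.
with open('demo2.html', 'w', encoding='gbk') as f:
    f.write(text)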
