python : BeautifulSoup 网页 table 抓取实例

从 http://www.lottery.gov.cn/ 抓取 体彩 排列5 历史数据

python 2.7 : get_pl5.py

 

# -*- coding: utf-8 -*-
import os,sys
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup

# Sports lottery "Pailie 5" draw-history page (first page only).
URL = "http://www.lottery.gov.cn/historykj/history.jspx?_ltype=plw"

# Download and parse the page; the connection is closed even if parsing fails.
page = urllib2.urlopen(URL)
try:
    soup = BeautifulSoup(page)
finally:
    page.close()

# Only the first <table> of the page is exported.
tables = soup.findAll('table')
if not tables:
    raise IndexError("no <table> element found at " + URL)
tab = tables[0]

# tab.tbody is None when the markup has no <tbody>; search the rows on the
# table itself in that case instead of failing on None.
rows_parent = tab.tbody
if rows_parent is None:
    rows_parent = tab

# Output format: one line per table row, every cell followed by '!'.
# The with-statement guarantees the file is closed if a write/encode fails.
with open("pl5.txt", "w") as fp:
    for tr in rows_parent.findAll('tr'):
        for td in tr.findAll('td'):
            # Python 2 file objects take byte strings, so the unicode cell
            # text is encoded as cp936 (GBK) before it is written.
            fp.write(td.getText().encode('cp936') + '!')
        fp.write('\n')

python 3.7 : pip install beautifulsoup4

# -*- coding: utf-8 -*-
import os,sys
from urllib import request
from bs4 import BeautifulSoup
# Scrape the sports-lottery "Pailie 5" draw-history tables into pl5.txt.
#
# Usage: get_pl5.py n
#   The first history page is always fetched, followed by the numbered
#   pages 2 .. n-1.  Every table row becomes one output line in which each
#   cell is followed by '!' and all spaces are removed.

USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
FIRST_PAGE_URL = "http://www.lottery.gov.cn/historykj/history.jspx?_ltype=plw"
PAGE_URL_TEMPLATE = "http://www.lottery.gov.cn/historykj/history_{0}.jspx?_ltype=plw"


def _fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser.

    A browser-like User-Agent header is sent with the request (presumably
    because the site rejects urllib's default one -- unverified).  The HTTP
    response is closed even when parsing raises.
    """
    req = request.Request(url, headers={'User-Agent': USER_AGENT})
    with request.urlopen(req) as page:
        return BeautifulSoup(page, "html.parser")


def _write_first_table(soup, fp, url):
    """Append the rows of the first <table> in *soup* to the open file *fp*.

    *url* is only used to make the error message useful.

    Raises:
        IndexError: if the page contains no <table> element.
    """
    tables = soup.find_all('table')
    if not tables:
        raise IndexError("no <table> element found at " + url)
    table = tables[0]

    # table.tbody is None when the markup has no <tbody>; search the rows on
    # the table itself in that case instead of failing on None.
    container = table.tbody if table.tbody is not None else table

    for tr in container.find_all('tr'):
        cells = [td.get_text() + '!' for td in tr.find_all('td')]
        fp.write(''.join(cells).replace(' ', ''))
        fp.write('\n')


def main(argv):
    """Run the scraper; *argv* is the full argument vector (sys.argv).

    Returns the process exit status: 0 on success, 1 on a usage error.
    """
    if len(argv) != 2:
        print('usage: get_pl5.py n ')
        return 1
    try:
        n = int(argv[1])
    except ValueError:
        # A non-numeric argument is a usage error, not a traceback.
        print('usage: get_pl5.py n ')
        return 1

    # Fetch the first page before opening the output file so that a failed
    # download does not truncate an existing pl5.txt.
    first_page = _fetch_soup(FIRST_PAGE_URL)

    with open("pl5.txt", "w") as fp:
        _write_first_table(first_page, fp, FIRST_PAGE_URL)

        # NOTE(review): range(2, n) stops at page n-1, i.e. n is an exclusive
        # upper bound.  Confirm whether page n was meant to be included.
        for i in range(2, n):
            page_url = PAGE_URL_TEMPLATE.format(i)
            _write_first_table(_fetch_soup(page_url), fp, page_url)
            print(i)  # progress indicator: number of the page just saved
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

 

 

 

 

你可能感兴趣的:(python,BeautifulSoup,python)