获得alexa网站下面美国排名前二十的网址

#!/usr/bin/python
#encoding:utf8
#author:cosme

import re
import urllib

def _fetch_page(url):
    """Download *url* and return the response body as text."""
    # Function-scope import so the block works on Python 2 (which the
    # script was written for) as well as Python 3, where urlopen moved.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        body = response.read()
    finally:
        response.close()

    # Python 3 hands back bytes; on Python 2 the body is already a str.
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body


def getTop25ReginalSties(country='US', html=None, dump_path='file04.txt'):
    """Return the site names listed on an Alexa per-country top-sites page.

    A line of the page counts as a site entry when it starts with an
    anchor of the form ``<a href="/siteinfo/NAME">...</a>``; NAME is what
    gets collected. Each name is printed as it is found, followed by the
    running number of names collected so far.

    Args:
        country: Country code appended to the Alexa top-sites URL. The
            script is meant for the US ranking, so the default is 'US'
            (the code used to hard-code 'AM').
        html: Page text to parse. When None (the default) the page is
            downloaded from alexa.com.
        dump_path: File the page text is saved to before parsing
            ('file04.txt' by default). Pass None or '' to skip writing.

    Returns:
        list: The matched site names, in page order.
    """
    site_link = re.compile(r'^\<a\s+href\=\"/siteinfo/(?P<gettop>\S+)\"\>\S+\</a\>.*')

    if html is None:
        html = _fetch_page('http://www.alexa.com/topsites/countries/' + country)

    if dump_path:
        # Keep a copy of the page on disk; "with" guarantees the handle is
        # closed even if the write fails.
        with open(dump_path, 'w') as dump:
            dump.write(html)

    urllist = []
    # Parse straight from memory: there is no need to re-read the dump.
    for line in html.splitlines():
        mo = site_link.match(line)
        if not mo:
            continue
        site = mo.group('gettop')
        print(site)
        urllist.append(site)
        print(len(urllist))

    return urllist

# Run the scrape only when this file is executed as a script, so that
# importing the module does not trigger a download and a file write.
if __name__ == '__main__':
    getTop25ReginalSties()
 
 
 

你可能感兴趣的:(获得alexa网站下面美国排名前二十的网址)