hax01tips

hax01tips
注意：本文之后极有可能设为私有
题目

hax01
Your mission is the following: Simply enter a URL into the box. The domain of the URL must be or end with 'nasa.gov'. The URL will be fetched right away. The content returned should contain the string: "2200178118" in the first 10 Kbytes of data. 404/403/etc error pages are not accepted. Remember, do not do anything illegal. Make sure you type the right URL, do not guess.

Hint: google is your friend.
http://google.com/search?q=site:nasa.gov

当时我的思路是找出所有以nasa.gov结尾的域名，然后遍历这些网址。之后我真的写了个python程序，取了google检索出来的前1000个页面，取出域名，保存起来，去除重复的有500多个。接着，读取html页面，判断是否含有该字符串。其间，遇到了个网速的问题，超时后经常会跑到电信的114搜索上去。验证了170多个页面后，我发现自己理解错题目了，这里的URL并不是指URL以nasa.gov结尾，而是指URL的域名以nasa.gov结尾。我无语了，这相当于域名下的所有网页都有可能。这个工作量巨大得几乎是不可能的。暂时中止。
以下代码可供参考，修改了n次，可能现在已经没法直接运行。

2.5
# hax01 URL scanner, Python 2.5 version.
#
# Reads one absolute URL per line from 'nasa.txt', fetches each page, dumps
# the raw body to '<sanitized url>.txt' in the current directory and prints:
#   114       - the body contains 'daohang.118114.cn' (presumably the ISP's
#               "114" search page that hijacked requests landed on)
#   the URL   - '2200178118' occurs within the first 10240 bytes
#   blank     - neither of the above
from __future__ import with_statement  # `with` is opt-in on Python 2.5

from urllib import FancyURLopener
import urllib2
import sys
import re
import locale

# Disabled: earlier attempt that harvested *.nasa.gov host names from Google
# result pages.  Kept verbatim for reference.
#
# class MyOpener(FancyURLopener):
#     version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)Gecko/20071127 Firefox/2.0.0.11'
#
# res = re.compile(r'(([a-zA-Z]+\.)+nasa.gov)')
#
# myopener = MyOpener()
# url = 'http://www.google.co.jp/search?&num=100&as_qdr=all&as_occt=any&as_sitesearch=nasa.gov'
# li = []
# for i in range(0, 10):
#     url = url + '&start=' + str(i*100)
#     page = myopener.open(url)
#     str1 = page.read()
#     for aItem in res.findall(str1):
#         if not aItem[0] in li:
#             li.append(aItem[0])

with open('nasa.txt') as li:
    m = 0
    for a in li:
        # Drop the trailing newline; it must not end up in the request or in
        # the dump file name.
        url = a.strip()
        if not url:
            continue
        m = m + 1
        print(m)  # progress counter
        req = urllib2.Request(url)
        try:
            response = urllib2.urlopen(req)
            try:
                the_page = response.read()
            finally:
                response.close()
            # Build a file name that is valid on any filesystem: strip the
            # scheme and collapse every other unsafe character into '_'.
            dump_name = re.sub(r'[^A-Za-z0-9._-]+', '_',
                               url.split('://', 1)[-1]).strip('_') + '.txt'
            with open(dump_name, 'wb') as nasa:
                nasa.write(the_page)
            if 'daohang.118114.cn' in the_page:
                print('114')
            elif the_page.find('2200178118', 0, 10240) != -1:
                print(url)
            else:
                print('\n\n')
        except urllib2.URLError:
            # Fetched via sys.exc_info() so the listing parses on 2.5 and on
            # later interpreters alike (`except X, e` is Python 2 only,
            # `except X as e` needs 2.6+).
            e = sys.exc_info()[1]
            # HTTPError does not always carry .reason; fall back to the
            # exception itself.
            print(getattr(e, 'reason', e))

# Disabled: scratch code used to try the host-name regex on a saved page.
#
# #gUrl = 'http://www.google.co.jp/search?hl=ja&source=hp&q=site%3Anasa.gov&lr=&aq=f&oq='
# #google = urllib.urlopen(gUrl)
# #str = google.read()
# for str in open('sitenasa_gov.htm'):
#     for aItem in res.findall(str):
#         print aItem[0]
#
# #print str
# str = 'www.xxx.nasa.gov/wwf.nasa.gov'

#2200178118

3.1
"""Scan a list of URLs for the hax01 challenge string (Python 3.1 version).

Usage: python <this script> URL_LIST_FILE

URL_LIST_FILE holds one absolute URL per line.  Every page is fetched, its
raw body is dumped to ``<sanitized url>.txt`` in the current directory, and
one line is printed per URL:

* ``114``     - the body contains ``icc.qonc.com`` (presumably the ISP search
                page that hijacked requests landed on)
* the URL     - ``2200178118`` occurs within the first 10240 bytes
* blank lines - neither of the above
"""
import locale
import re
import sys
import urllib
import urllib.error
import urllib.request

try:
    # Only the disabled harvesting snippet below uses this class; it was
    # removed from urllib.request in Python 3.14.
    from urllib.request import FancyURLopener
except ImportError:
    FancyURLopener = None

TARGET = b'2200178118'
SCAN_LIMIT = 10240  # the challenge only inspects the first 10 Kbytes
HIJACK_MARKER = b'icc.qonc.com'

# Disabled: earlier attempt that harvested *.nasa.gov host names from Google
# result pages.  Kept verbatim for reference.
#
# class MyOpener(FancyURLopener):
#     version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)Gecko/20071127 Firefox/2.0.0.11'
#
# res = re.compile(r'(([a-zA-Z]+\.)+nasa.gov)')
#
# myopener = MyOpener()
# url = 'http://www.google.co.jp/search?&num=100&as_qdr=all&as_occt=any&as_sitesearch=nasa.gov'
# li = []
# for i in range(0, 10):
#     url = url + '&start=' + str(i*100)
#     page = myopener.open(url)
#     str1 = page.read()
#     for aItem in res.findall(str1):
#         if not aItem[0] in li:
#             li.append(aItem[0])


def classify_page(page):
    """Classify raw response bytes as 'hijacked', 'match' or 'miss'.

    The comparison is done on bytes so that pages which are not valid UTF-8
    are still checked, and so that the 10240 limit counts bytes rather than
    decoded characters.
    """
    if HIJACK_MARKER in page:
        return 'hijacked'
    if TARGET in page[:SCAN_LIMIT]:
        return 'match'
    return 'miss'


def cache_filename(url):
    """Return a filesystem-safe dump file name for *url*.

    The scheme is dropped and every run of characters outside
    ``[A-Za-z0-9._-]`` (slashes, '?', '=' ...) becomes a single underscore.
    """
    without_scheme = url.split('://', 1)[-1]
    safe = re.sub(r'[^A-Za-z0-9._-]+', '_', without_scheme).strip('_')
    return safe + '.txt'


def scan(list_path):
    """Fetch every URL listed in *list_path*, dump it and print the verdict."""
    with open(list_path) as url_list:
        for line in url_list:
            # Strip the newline so it is not sent as part of the URL.
            url = line.strip()
            if not url:
                continue
            try:
                response = urllib.request.urlopen(urllib.request.Request(url))
                try:
                    the_page = response.read()
                finally:
                    response.close()
            except urllib.error.HTTPError as err:
                # Must precede URLError: HTTPError is its subclass and is the
                # only one of the two that has a status code.
                print(err.code)
                continue
            except urllib.error.URLError as err:
                print(err.reason)
                continue
            except ValueError as err:
                # Raised for lines that are not absolute URLs.
                print(err)
                continue

            with open(cache_filename(url), 'wb') as dump:
                dump.write(the_page)

            verdict = classify_page(the_page)
            if verdict == 'hijacked':
                print('114')
            elif verdict == 'match':
                print(url)
            else:
                print('\n\n')


def main(argv=None):
    """Command-line entry point; expects the URL list path as sole argument."""
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 1:
        sys.exit('usage: python <script> URL_LIST_FILE')
    scan(args[0])


if __name__ == '__main__':
    main()

# Disabled: scratch code used to try the host-name regex on a saved page
# (still Python 2 syntax).
#
# #gUrl = 'http://www.google.co.jp/search?hl=ja&source=hp&q=site%3Anasa.gov&lr=&aq=f&oq='
# #google = urllib.urlopen(gUrl)
# #str = google.read()
# for str in open('sitenasa_gov.htm'):
#     for aItem in res.findall(str):
#         print aItem[0]
#
# #print str
# str = 'www.xxx.nasa.gov/wwf.nasa.gov'

#2200178118

而后，过了大概几个月，变换思路，解决，意外的简单……
事实上，只要向服务器提交数据，一般服务器也会将该数据返回到页面上。该题最后的hint不是让我们来搜该域名，而是告诉我们怎样在google.com的页面上显示我们想要的数据。譬如 http://www.google.co.jp/search?q=2200178118 该页面的前10K里应该包含了该字符串。接下来，我们只需要在nasa.gov上找个页面提交数据就行了。
over

hax01tips

hax01tips

你可能感兴趣的:(hax01tips)