# http://dict.youdao.com/search?q=hello&tab=chn&keyfrom=dict.result cannot be used here
# because of a Python bug.
import urllib
from BeautifulSoup import BeautifulSoup
import sys
global file  # output file handle; opened in __main__ and written by getWebContent()
def getWebContent(url, word):
    html = urllib.urlopen(url).read()
    # html = html.decode("gb2312", "ignore").encode("utf-8", "ignore")
    html = unicode(html, "gb2312", "ignore").encode("utf-8", "ignore")
    soup = BeautifulSoup(html)
    # filter 1: keep only the <div class="explain"> block
    data = str(soup.find("div", {"class": "explain"}))
    # strContent = data.renderContents() + "\n"  # by default the string is treated as ASCII,
    #                                            # but the original text is UTF-8,
    #                                            # because BeautifulSoup stores it that way
    # filter 2: parse the block again and pull out only the text nodes
    soup = BeautifulSoup(data)
    # BeautifulSoup generators: http://www.crummy.com/software/BeautifulSoup/documentation.zh.html#Generators
    outtext = ''.join([element for element in soup.recursiveChildGenerator()
                       if isinstance(element, unicode)])
    # make some rendering: start each numbered sense on its own line
    for item in range(1, 10):
        outtext = outtext.replace(str(item), "\n%s" % str(item))
    outtext = outtext.replace(" ", "\n")
    outtext = word + ":\n" + outtext + "\n"
    file.write(outtext)
    print outtext.decode("utf-8").encode("gbk")  # re-encode to GBK for the Windows console
def word_FromFile():
    wordFile = open("F:/Whu/EnghlishWords.txt", "r")
    for word in wordFile.readlines():
        print isinstance(word, unicode)
        print word.decode("utf-8")
        # Be careful: because the Chinese words were saved as UTF-8 in Notepad,
        # a 3-byte BOM is prepended to the file and has to be stripped, e.g.:
        #     if data[:3] == codecs.BOM_UTF8:
        #         data = data[3:]
        #     print data.decode("utf-8")
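        # Sketch of the BOM handling described above (an addition, not in the original
        # script; it assumes the first line of the word list may carry a UTF-8 BOM):
        import codecs  # in practice this import would sit at the top of the file
        if word.startswith(codecs.BOM_UTF8):
            word = word[len(codecs.BOM_UTF8):]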
        word = word.strip()  # drop the trailing newline so it does not end up in the URL
        url = "http://dict.baidu.com/s?wd=%s" % word
        getWebContent(url, word)
if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('utf-8')
    file = open("F:/Whu/EnghlishWords_translate.txt", 'w')
    word_FromFile()
    file.flush()
    file.close()