Scraping some beautiful Chinese-English bilingual short sentences
# Fetch the page for a given URL
import urllib.request

def get_html(url):
    # Set a User-Agent header so the server does not see the request as coming from a script
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    # The target site serves GBK-encoded pages
    html_content = response.read().decode("gbk")
    return html_content
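The hard-coded decode("gbk") works because this particular site is GBK-encoded. As a more defensive sketch (mine, not part of the original article; get_html_safe is a hypothetical name), you could prefer whatever charset the server declares:

import urllib.request

# Hypothetical variant of get_html: fall back to the charset declared
# in the response headers, defaulting to gbk as the article does.
def get_html_safe(url):
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    req = urllib.request.Request(url=url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(req) as response:
        raw = response.read()
        # get_content_charset() reads the charset from the Content-Type header
        charset = response.headers.get_content_charset() or 'gbk'
        return raw.decode(charset, errors='replace')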
Next, find a pattern that identifies the content we want to scrape. On the site above, the sentences are marked up like this:
<p>
<br />
1、我的世界不允许你的消失,不管结局是否完美。<br />
No matter the ending is perfect or not, you cannot disappear from my world.</p>
<p>
2、爱情是一个精心设计的谎言。<br />
Love is a carefully designed lie.</p>
Each phrase starts with "<p>" and ends with "</p>".
import re

def get_sentence(content):
    content_list = re.findall('<p>.*?</p>', content, re.S)
This grabs everything from a "<p>" to the nearest "</p>"; ".*?" is a non-greedy match (it stops at the first "</p>" rather than the last), and re.S lets "." match across newlines. The raw matches come in two shapes.

type1 (a plain sentence pair):

'<p>\r\n\t<br />\r\n\t1、我的世界不允许你的消失,不管结局是否完美。<br />\r\n\tNo matter the ending is perfect or not, you cannot disappear from my world.</p>'

type2 (a pair that still contains an <a> link):

"<p>\r\n\t66、<a href='http://www.siandian.com/lianaijiqiao/' target='_blank'><u>恋爱</u></a>中,干傻事总是让人感到十分美妙。<br />\r\n\tIn love folly is always sweet.</p>"
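A quick illustration (my example, not from the article; sample_html is made up) of why the non-greedy ".*?" matters here, and what re.S does:

import re

sample_html = "<p>one</p>\n<p>two</p>"

# Non-greedy: each match stops at the nearest </p>
print(re.findall('<p>.*?</p>', sample_html, re.S))
# -> ['<p>one</p>', '<p>two</p>']

# Greedy: runs on to the last </p> and swallows both paragraphs
print(re.findall('<p>.*</p>', sample_html, re.S))
# -> ['<p>one</p>\n<p>two</p>']

# re.S (DOTALL) lets '.' match the newline between the paragraphs;
# without it, no match could cross a line break.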
def clean_sentence(item_temp):
    # Strip the layout whitespace and the tags; the <br /> between the
    # Chinese and the English half becomes the "&&" separator
    item_temp = item_temp.replace("\r\n\t<br />", "")\
        .replace("<br />\r\n\t", "&&")\
        .replace("<p>", "")\
        .replace("</p>", "")\
        .replace("\r\n\t", "")
    # Drop the leading numbering ("1、", "66、", ...)
    item_temp = item_temp.split('、')
    if len(item_temp) == 2:
        item_temp = item_temp[1]
    else:
        # print(item_temp)
        return ''
    # Keep only sentences with no leftover tags; this discards the type2
    # matches, which still contain an <a href=...> link
    if "<" not in item_temp:
        return item_temp + " &$\n"
    return ''
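As a quick sanity check (my snippet, not the article's), feeding the type1 match through clean_sentence reproduces the cleaned form shown next:

raw = ('<p>\r\n\t<br />\r\n\t1、我的世界不允许你的消失,不管结局是否完美。<br />'
       '\r\n\tNo matter the ending is perfect or not, you cannot disappear from my world.</p>')
print(clean_sentence(raw))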
The cleaned sentence comes out as follows (the && and &$ markers are added so the Chinese and English halves can be split apart later):

我的世界不允许你的消失,不管结局是否完美。&&No matter the ending is perfect or not, you cannot disappear from my world. &$
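The article never shows that later split step, but with these markers it is straightforward; split_pair below is a hypothetical helper of mine:

def split_pair(cleaned):
    # Drop the " &$\n" record terminator, then cut at the && separator
    cleaned = cleaned.replace(" &$\n", "")
    chinese, english = cleaned.split("&&")
    return chinese, english

zh, en = split_pair("爱情是一个精心设计的谎言。&&Love is a carefully designed lie. &$\n")
print(zh)  # 爱情是一个精心设计的谎言。
print(en)  # Love is a carefully designed lie.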
The complete get_sentence runs every raw match through clean_sentence and keeps only the non-empty results:

def get_sentence(content):
    content_list = re.findall('<p>.*?</p>', content, re.S)
    sentence_list = []
    for item_loop in content_list:
        item_loop = clean_sentence(item_loop)
        if len(item_loop) > 0:
            sentence_list.append(item_loop)
    for show in sentence_list:
        print(show)
    return sentence_list
if "# -*- coding: UTF-8 -*-
import re
import urllib.request
websites = ["http://www.siandian.com/haojuzi/1574.html"]
# 通过url获取网页
def get_html(url):
# 要设置请求头,让服务器知道不是机器人
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
req = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(req)
html_content = response.read().decode("gbk")
return html_content
# 通过正则表达式来获取语句
def get_sentence(content):
content_list = re.findall('.*?
', content, re.S)
sentence_list = []
for item_loop in content_list:
item_loop = clean_sentence(item_loop)
if len(item_loop) > 0:
sentence_list.append(item_loop)
for show in sentence_list:
print(show)
return sentence_list
# 清洗语句
def clean_sentence(item_temp):
item_temp = item_temp.replace("\r\n\t
"
, "").replace("
\r\n\t", "&&")\
.replace("
", "").replace(""
, "").replace("\r\n\t", "")
item_temp = item_temp.split('、')
if len(item_temp) == 2:
item_temp = item_temp[1]
else:
# print(item_temp)
return ''
if "not in item_temp:
return item_temp + " &$\n"
return ''
if __name__ == '__main__':
html = get_html(websites[0])
get_sentence(html)
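If you want to keep the results, a minimal extension (not in the original code; save_sentences and the file name are my assumptions) would write the list to disk so the &&/&$ markers survive for the later split:

# Hypothetical extension: persist the scraped pairs; each record already
# ends in " &$\n", so writelines keeps one pair per marker.
def save_sentences(sentence_list, path='sentences.txt'):
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(sentence_list)

# save_sentences(get_sentence(get_html(websites[0])))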