参考地址:https://cuiqingcai.com/1972.html
获取页面:https://iask.sina.com.cn/c/74.html
分析:获取每个问题的标题及答案地址,访问答案页面获取问题具体内容,答案内容,作答者,作答时间,构造成字典形式保存进数据库
主程序:
def main():
    """Fetch the page data with ``get_html`` and store it with ``insertdata``.

    Outline only: ``url``, ``iaskdb``, ``get_html`` and ``insertdata`` are
    not defined in this snippet and are expected to come from elsewhere.
    """
    page_data = get_html(url)
    insertdata(iaskdb, page_data)
实现代码:
字典的键是唯一的:对同一个键重复赋值时,后一次的值会覆盖前一次的值
# Flawed version kept for illustration: q_dict is not re-created inside the
# loop as shown, so every iteration overwrites the same 'qtitle'/'qurl' keys
# and appends the same dict object to self.q_dictlist again.
for q_detail in q_items:
#print(q_detail[0],q_detail[1])
#self.q_url_lists.append(q_url)
# q_detail[0] is stored as the title and q_detail[1] as the URL.
q_dict['qtitle'] = q_detail[0]
q_dict['qurl'] = q_detail[1]
q_index += 1
countnum += 1
# Appends a reference to q_dict, not a copy of its current contents.
self.q_dictlist.append(q_dict)
运行以上代码时,列表中的每个元素都指向同一个 q_dict 对象,因此所有记录的值都与最后一次赋值相同,导致写入数据库的数据不全
修改后代码:
class IASK:
def __init__(self):
self.headers = { 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' }
self.url_base = 'https://iask.sina.com.cn'
self.q_db = iasksql.Iasksql()
def get_html(self,url):
try:
myreq = urllib.request.Request(url,headers = self.headers)
myresponse = urllib.request.urlopen(myreq)
html = myresponse.read().decode('utf-8')
return html
except urllib.request.URLError as e:
if(hasattr(e,'reason')):
print("未连接,原因:" + e.reason)
#获取问题页面
def get_content(self,html):
p_title = re.compile('.*?(.*?)',re.S)
countnum = 0
q_items = []
q_items = re.findall(p_title,html)
return q_items
def start(self):
url_download = 'https://iask.sina.com.cn/c/74.html'
html = self.get_html(url_download)
onepagedata = self.get_content(html)
q_dict = {}
for qq_dict in onepagedata:
q_dict['url'] = qq_dict[0]
q_dict['title'] = qq_dict[1]
self.q_db.insertdata(q_dict)
self.q_db.showdb()
self.q_db.closecur()
# Script entry: build the scraper and run one crawl of the listing page.
spider = IASK()
spider.start()