Adding the downloader middleware
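This middleware routes each request through pyppeteer (headless Chromium) so JavaScript-rendered pages come back to Scrapy as plain HTML. For Scrapy to actually call it, the class has to be registered in settings.py; a minimal registration sketch, assuming the project module is named fundscrapy.middlewares (the module path is inferred from the class name below):

```python
# settings.py: registration sketch. The "fundscrapy.middlewares" path is
# an assumption based on the class name used in this post; 543 is the
# conventional default priority from the Scrapy project template.
DOWNLOADER_MIDDLEWARES = {
    'fundscrapy.middlewares.FundscrapyDownloaderMiddleware': 543,
}
```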
```python
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random
import pyppeteer
import asyncio
import os
from scrapy.http import HtmlResponse

pyppeteer.DEBUG = False


class FundscrapyDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        print("Init downloader middleware using pyppeteer.")
        # Pin the Chromium build that pyppeteer downloads on first launch.
        os.environ['PYPPETEER_CHROMIUM_REVISION'] = '588429'
        # pyppeteer.DEBUG = False
        print(os.environ.get('PYPPETEER_CHROMIUM_REVISION'))
        # Launch the browser once and reuse a single page for all requests.
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.getbrowser())
        loop.run_until_complete(task)
        # self.browser = task.result()
        print(self.browser)
        print(self.page)
        # self.page = await browser.newPage()

    async def getbrowser(self):
        self.browser = await pyppeteer.launch()
        self.page = await self.browser.newPage()
        # return await pyppeteer.launch()

    async def getnewpage(self):
        return await self.browser.newPage()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # Render the page in headless Chromium, then hand Scrapy the
        # resulting HTML so the normal download step is skipped entirely.
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.usePypuppeteer(request))
        loop.run_until_complete(task)
        # return task.result()
        return HtmlResponse(url=request.url, body=task.result(),
                            encoding="utf-8", request=request)

    async def usePypuppeteer(self, request):
        print(request.url)
        # page = await self.browser.newPage()
        await self.page.goto(request.url)
        content = await self.page.content()
        return content

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```
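One thing the class above never does is shut Chromium down, so every crawl leaves a headless browser process behind. A minimal cleanup sketch, assuming the same class: connect Scrapy's standard spider_closed signal in from_crawler and close the pyppeteer browser there. The spider_closed method name simply mirrors the existing spider_opened and is not part of the original code.

```python
# Cleanup sketch (not in the original middleware): close the browser
# when the spider finishes, wired up the same way as spider_opened.
@classmethod
def from_crawler(cls, crawler):
    s = cls()
    crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
    return s

def spider_closed(self, spider):
    # Run the close() coroutine on the same loop used elsewhere in the class.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(self.browser.close())
```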