各位大神
我在抓取一个内容全部由 JS 加载的网站时,先通过 PhantomJS 渲染得到了 HTML 响应,但在把它交给 lxml 做 XSLT 转换、返回提取结果时报错。
网站是: http://data.champdas.com/match/scheduleDetail-1-2017-6.html
我的spider是:
import scrapy
from gooseeker import GsExtractor
class CslsaiguoSpider(scrapy.Spider):
    """Spider that downloads one champdas.com match page and runs a
    GooSeeker XSLT extraction rule over it, saving the result to disk."""

    name = "cslsaiguo"
    allowed_domains = ["champdas.com"]
    start_urls = (
        'http://data.champdas.com/match/data-2772.html',
    )

    def parse(self, response):
        """Extract structured data from *response* and write it to a file.

        The raw response body is handed to ``GsExtractor.extractHTML``; the
        extraction result is printed and then written to a fixed path.
        """
        html = response.body
        # Debug dump of the page. NOTE(review): this assumes the body is
        # GBK-encoded; decode() raises UnicodeDecodeError if it is not valid
        # GBK -- confirm the site's real encoding.
        print(response.body.decode('gbk').encode().decode('utf8'))
        print("----------------------------------------------------------------------------")
        extra = GsExtractor()
        # Fetch the XSLT rule for theme "cslsaiguo" from the GooSeeker API.
        extra.setXsltFromAPI("356e75d4d09d77a2b60a058d77047847", "cslsaiguo")
        result = extra.extractHTML(html)
        print(result)
        file_name = 'C:/Users/deryk/csl/temp/cslsaiguo/1.xml'
        # Use a context manager so the file handle is flushed and closed
        # deterministically instead of being left to the garbage collector.
        with open(file_name, "wb") as out_file:
            out_file.write(result)
gooseeker.py中的代码是:
import time
from urllib import request
from urllib.parse import quote
from lxml import etree
class GsExtractor(object):
    """Hold an XSLT stylesheet and apply it to HTML documents via lxml.

    The stylesheet can be loaded from a file, from an in-memory string, or
    downloaded from the GooSeeker API; it is stored in ``self.xslt``.
    """

    def __init__(self):
        # The original spelled this ``_init_`` (single underscores), so it was
        # never invoked on construction and ``self.xslt`` stayed undefined
        # until one of the setters was called.
        self.xslt = ""

    # Backward-compatible alias for the original (misspelled) method name.
    _init_ = __init__

    def setXsltFromFile(self, xsltFilePath):
        """Read the XSLT stylesheet from the UTF-8 text file *xsltFilePath*."""
        with open(xsltFilePath, 'r', encoding='UTF-8') as xslt_file:
            self.xslt = xslt_file.read()

    def setXsltFromMem(self, xsltStr):
        """Use the given string *xsltStr* as the XSLT stylesheet."""
        self.xslt = xsltStr

    def setXsltFromAPI(self, APIKey, theme, middle=None, bname=None):
        """Download the XSLT stylesheet from the GooSeeker API.

        *APIKey* and *theme* are always sent; *middle* and *bname* are
        appended as extra query parameters only when truthy. The raw response
        body (bytes) is stored as the stylesheet.
        """
        apiurl = ("http://www.gooseeker.com/api/getextractor?key=" + APIKey
                  + "&theme=" + quote(theme))
        if middle:
            apiurl = apiurl + "&middle=" + quote(middle)
        if bname:
            apiurl = apiurl + "&bname=" + quote(bname)
        # Close the HTTP response once the body has been read.
        with request.urlopen(apiurl) as apiconn:
            self.xslt = apiconn.read()

    def getXslt(self):
        """Return the currently loaded XSLT stylesheet."""
        return self.xslt

    def extract(self, html):
        """Apply the loaded stylesheet to *html* (an HTML DOM object) and
        return the transformation result."""
        xslt_root = etree.XML(self.xslt)
        # etree.XSLT compiles the stylesheet; per the traceback in this post
        # it raises XSLTParseError when the stylesheet cannot be compiled.
        transform = etree.XSLT(xslt_root)
        result_tree = transform(html)
        return result_tree

    def extractHTML(self, html):
        """Parse the HTML source *html* and return the extraction result."""
        doc = etree.HTML(html)
        return self.extract(doc)
报的错误是
Traceback (most recent call last):
File "c:\python36\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Users\Deryk\csl\sprots\sprots\spiders\cslsaiguo.py", line 24, in parse
result = extra.extractHTML(html)
File "C:\Users\Deryk\csl\sprots\gooseeker.py", line 50, in extractHTML
return self.extract(doc)
File "C:\Users\Deryk\csl\sprots\gooseeker.py", line 44, in extract
transform = etree.XSLT(xslt_root)
File "src\lxml\xslt.pxi", line 409, in lxml.etree.XSLT.__init__ (src\lxml\lxml.etree.c:177760)
lxml.etree.XSLTParseError: Failed to compile predicate
|
|
|
|
|
共 8 个关于本帖的回复 最后回复于 2017-4-21 19:23