I am trying to scrape a page that is filled with javascript. The url is:
http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840
I have used the following code to get the data. Apparently this code should handle the javascript and return a complete html file but it does not. There might be an issue of timing and if so, I am not quite clear as to where you delay the proram to allow for a full html.
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def getHtml(str_url):
r_html = Render(str_url)
html = r_html.frame.toHtml()
return html
str_url = 'http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840'
str_html = getHtml(str_url)
print(str_html)
This gives me the html that you would get if you ask for page source from a web browser. Of course there are more stuff on the page as all the tables are filled with javascript functions. Using Firebug, the id of the table I am looking for is "sharesInIndexTable. The items I really would like to scrape are the links under the name of each company - but having access to the whole table to parse with beautifulsoup would be even nicer. From this table, one should be able to find the word "Carlsberg" (as a potential test to see if the AJAX is fully loaded). Then I tried to figure out something to parse the DOM and I tried this:
import sys
from PyQt4 import QtGui, QtCore, QtWebKit
class Sp():
def printit(self):
data = self.webView.page().mainFrame().findFirstElement('id="sharesInIndexTable"')
print(data)
def main(self):
self.webView = QtWebKit.QWebView()
self.webView.load(QtCore.QUrl("http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840"))
QtCore.QObject.connect(self.webView,QtCore.SIGNAL("loadFinished(bool)"),self.printit)
app = QtGui.QApplication(sys.argv)
s = Sp()
s.main()
sys.exit(app.exec_())
All I get from this is PyQt4.QtWebkit.QWebElement object at 0x03294830 (your result may vary). Whatever I tried to put this address into readable format failed. This code also seems to run twice. Then I tried this one (somewhat adapted for my needs):
#!/usr/bin/python
# These lines will get us the modules we need.
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage, QWebView
class Scrape(QApplication):
def __init__(self):
# only work with ["test"] as it normally takes an array of args
super(Scrape, self).__init__(["test"])
# Create a QWebView instance and store it.
self.webView = QWebView()
# Connect our searchform method to the searchform signal of this new
# QWebView.
self.webView.loadFinished.connect(self.searchForm)
def load(self, url):
# In the __init__ we stored a QWebView instance into self.webView so
# we can load a url into it. It needs a QUrl instance though.
self.webView.load(QUrl(url))
def searchForm(self):
# We landed here because the load is finished. Now, load the root document
# element. It'll be a QWebElement instance. QWebElement is a QT4.6
# addition and it allows easier DOM interaction.
documentElement = self.webView.page().currentFrame().documentElement()
# Let's find the search input element.
print("Begin search")
inputSearch = documentElement.findFirst('id="sharesInIndexTable"')
# Disconnect ourselves from the signal.
self.webView.loadFinished.disconnect(self.searchForm)
print("End search")
# And connect the next function.
self.webView.loadFinished.connect(self.searchResults)
def searchResults(self):
# As seen above, first grab the root document element and then load all g
# classed list items.
print("Begin results")
results = self.webView.page().currentFrame().documentElement().findAll('td')
# Change the resulting QWebElementCollection into a list so we can easily
# iterate over it.
for e in results.toList():
# Just print the results.
print(e.tohtml())
# We are inside a QT application and need to terminate that properly.
print("End results")
self.exit()
# Instantiate our class.
my_scrape = Scrape()
# Load the Google homepage.
my_scrape.load('http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840')
# Start the QT event loop.
my_scrape.exec_()
I added the print() statement to figure out if the program was fully executing the commands. This produce nothing at all (except the print statements)
Inspecting the source page, I can find the script that fill the table it is:
var sharesInIndex = {
load: function () {
var index = webCore.getInstrument();
var nLabel = 'nm';
var hiddenAttributes = ",lists,tp,hlp,isin,note,";
var xslt = "inst_table.xsl";
var options = ",noflag,sectoridicon,";
var xpath = "//index//instruments";
// Check if swedish r�nteindex or Icelandic r�nteindex.
if ( index.indexOf('OMFSE') >= 0 || webCore.getInstrument().indexOf('IS00000') >= 0 ) {
hiddenAttributes += ",to,sectid,";
nLabel = 'fnm';
}
// Check if weights index present (typeof)
var shbindex = ",SE0002834820,SE0002834838,SE0002834846,SE0002977397,";
if ( shbindex.indexOf(index) >= 0 ) {
xslt = "inst_table_windex.xsl";
options += "windex,";
xpath = "//index";
}
var query = webCore.createQuery(
Utils.Constants.marketAction.getIndexInstrument, {
inst__a: "0,1,2,5,37,4,20,21,23,24,33,34,97,129,98,10", /* 87,*/
Instrument: index,
XPath: xpath,
ext_xslt: xslt,
ext_xslt_lang: currentLanguage,
ext_xslt_tableId: "sharesInIndexTable",
ext_xslt_hiddenattrs: hiddenAttributes,
ext_xslt_notlabel: nLabel,
ext_xslt_options: options
});
$("#sharesInIndexOutput").empty().loading("/static/nordic/css/img/loading.gif");
$("#sharesInIndexOutput").load( webCore.getProxyURL('prod'), {xmlquery: query},
function( responseText, textStatus, XMLHttpRequest) {
$("#sharesInIndexTable").tablesorter({
widgets: ['zebra'],
textExtraction: 'complex',
numberFormat: Utils.Constants.numberFormat[currentLanguage]
});
$("#sharesInIndexTable a").each( function() {
$(this).attr("href",webCore.getURL( Utils.Constants.pages.micrositeShare, $(this).attr('name') ));
});
});
}
};
$(document).ready( sharesInIndex.load );
I know there is an "execute_script" command but I don't know how yo implement it nor have I found any examples suitable for this - I don't mind if the result is Json or HTML or plain text. I believe this is where the answer will lie: (1) load the page, (2) run the script for the page, (3) get the results, (4) parse/print/save the results...
I would preferably have a headless solution if there is one and even Phantomjs on windows is not completely headless as it pops up a cmd window (I am aware that you can get rid of this with a virtual display on Linux - but that's is not the environment). Also, just telling me: oh you have to poll it to see if the data is loaded then you retrieve it is not very helpful: can you tell me (even in pseudo code) how a poll is done and more importantly roughly where in the program does that polling takes place (that's why I posted fully executable code - if someone else has the same problem they should have a complete and easy to understand answer).
My latest attemps (1 - insert a delay to allow the AJAX to load)
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
import time
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.mainFrame().load(QUrl(url))
self.loadFinished.connect(self._loadFinished)
self.app.exec_()
def _loadFinished(self, result):
time.sleep(5)
self.frame = self.currentFrame()
self.app.quit()
url = 'http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840'
r = Render(url)
html = r.frame.toHtml()
print(html)
(2 - polling for a known item in the source page) - item found with firebug inspector - maybe the syntax is wrong for the argument of findFirst.
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
import time
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.mainFrame().load(QUrl(url))
self.loadFinished.connect(self._loadFinished)
self.app.exec_()
def _loadFinished(self, result):
counter = 0
while(self.mainFrame().documentElement().findFirst("id=sharesInIndexTable")):
counter+=1
print(counter)
time.sleep(1)
self.frame = self.currentFrame()
self.app.quit()
url = 'http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840'
r = Render(url)
html = r.frame.toHtml()
print(html)
This last one has a counter to show if something is happening. It counts forever and has to be stopped with ctrl-c.
(3 - Another variant using WebElement)
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
import time
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.mainFrame().load(QUrl(url))
self.loadFinished.connect(self._loadFinished)
self.app.exec_()
def _loadFinished(self, result):
table = self.mainFrame().documentElement().findFirst("id=sharesInIndexTable")
print(table) #prints: <PyQt4.QtWebKit.QWebElement object at 0x0319FB0>
print("Attributes:")
print(table.attributeNames()) #prints: [] i.e. None
print("Classes: ")
print(table.classes()) #prints: [] i.e. None
print("InnerXML: " + table.toInnerXml()) #prints nothing
print("OuterXML: " + table.toOuterXml()) #prints nothing
print("Done")
self.frame = self.currentFrame()
self.app.quit()
url = 'http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840'
r = Render(url)
html = r.frame.toHtml()
No success either with this one. I put in the code what was printed. There is apparently an object there but I cannot see what is inside.