executing the script of a webpage with Python

Question

I am trying to scrape a page that is filled with javascript. The url is:

http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840

I have used the following code to get the data. Apparently this code should handle the JavaScript and return the complete HTML, but it does not. There might be a timing issue, and if so, I am not clear where in the program the delay should go so that the full HTML has time to load.

import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *

class Render(QWebPage):
    """Load one URL in an off-screen QWebPage and keep its main frame.

    Constructing an instance blocks: the constructor starts the Qt event
    loop and only returns once the ``loadFinished`` handler has quit it.
    """

    def __init__(self, url):
        """Create the application, request *url* and run the event loop."""
        # The application object is created first, before the QWebPage
        # base initialiser runs.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # The handler is attached before the load is requested.
        self.loadFinished.connect(self._loadFinished)
        target = QUrl(url)
        self.mainFrame().load(target)
        # Returns only after _loadFinished() has called quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Store the main frame on ``self.frame`` and stop the event loop.

        *result* is the value delivered by ``loadFinished``; it is ignored.
        """
        # NOTE(review): content that scripts fetch after this signal is
        # presumably not in the frame yet -- confirm against the page.
        self.frame = self.mainFrame()
        self.app.quit()

def getHtml(str_url):
    """Render *str_url* and return the HTML held by the resulting frame."""
    rendered = Render(str_url)
    return rendered.frame.toHtml()

# Index page whose constituents table is the scraping target.
str_url = 'http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840'
# getHtml() builds a Render, whose constructor runs the Qt event loop until
# loadFinished fires, and then returns the frame's HTML.
str_html = getHtml(str_url)
print(str_html)

This gives me the HTML that you would get if you asked for the page source from a web browser. Of course there is more on the page, as all the tables are filled in by JavaScript functions. Using Firebug, the id of the table I am looking for is "sharesInIndexTable". The items I really would like to scrape are the links under the name of each company - but having access to the whole table to parse with BeautifulSoup would be even nicer. From this table, one should be able to find the word "Carlsberg" (as a potential test to see if the AJAX is fully loaded). Then I tried to figure out something to parse the DOM and I tried this:

import sys
from PyQt4 import QtGui, QtCore, QtWebKit

class Sp():
    """Load the index page in a QWebView and print the constituents table element."""

    def printit(self):
        """Slot for ``loadFinished``: look up the table element and print it."""
        # findFirstElement() takes a CSS selector, so an element id is
        # written "#id" rather than 'id="..."'.
        data = self.webView.page().mainFrame().findFirstElement('#sharesInIndexTable')
        # Printing the QWebElement itself shows only its object repr;
        # data.toOuterXml() yields the markup and data.isNull() tells
        # whether the selector matched anything.
        print(data)

    def main(self):
        """Create the view, start loading the page and hook up ``printit``."""
        self.webView = QtWebKit.QWebView()
        self.webView.load(QtCore.QUrl("http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840"))
        QtCore.QObject.connect(self.webView,QtCore.SIGNAL("loadFinished(bool)"),self.printit)


# Launcher: the QApplication must exist before main() creates the QWebView.
app = QtGui.QApplication(sys.argv)
s = Sp()
s.main()
sys.exit(app.exec_())

All I get from this is PyQt4.QtWebkit.QWebElement object at 0x03294830 (your result may vary). Everything I tried in order to turn this object into a readable format failed. This code also seems to run twice. Then I tried this one (somewhat adapted for my needs):

#!/usr/bin/python

# These lines will get us the modules we need.
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage, QWebView

class Scrape(QApplication):
    """QApplication that loads a page in a QWebView and prints its table cells.

    The work is chained through ``loadFinished``: the first emission runs
    ``searchForm``, which re-wires the signal to ``searchResults``.
    """

    def __init__(self):
        """Start the application and create the web view."""
        # only work with ["test"] as it normally takes an array of args
        super(Scrape, self).__init__(["test"])
        # Create a QWebView instance and store it.
        self.webView = QWebView()
        # Run searchForm when the view reports that loading has finished.
        self.webView.loadFinished.connect(self.searchForm)

    def load(self, url):
        """Start loading *url* in the stored web view."""
        # The view expects a QUrl instance rather than a plain string.
        self.webView.load(QUrl(url))

    def searchForm(self):
        """First ``loadFinished`` handler: look up the table, then re-wire."""
        # Root document element of the current frame (a QWebElement).
        documentElement = self.webView.page().currentFrame().documentElement()
        print("Begin search")
        # findFirst() takes a CSS selector, so the id is written "#id".
        # The result is not used any further in this method.
        inputSearch = documentElement.findFirst('#sharesInIndexTable')
        # Disconnect ourselves from the signal.
        self.webView.loadFinished.disconnect(self.searchForm)
        print("End search")
        # searchResults now runs only if loadFinished is emitted again;
        # nothing in this class starts a second load.
        self.webView.loadFinished.connect(self.searchResults)

    def searchResults(self):
        """Second ``loadFinished`` handler: print every ``td`` and exit."""
        print("Begin results")
        results = self.webView.page().currentFrame().documentElement().findAll('td')

        # Change the resulting QWebElementCollection into a list so we can
        # easily iterate over it.
        for e in results.toList():
            # QWebElement has no tohtml(); toOuterXml() returns the markup.
            print(e.toOuterXml())
        # We are inside a Qt application and need to terminate it properly.
        print("End results")
        self.exit()

# Instantiate our class.
my_scrape = Scrape()
# Load the Nasdaq OMX Nordic index page.
my_scrape.load('http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840')
# Start the Qt event loop.
my_scrape.exec_()

I added the print() statements to figure out whether the program was fully executing the commands. This produces nothing at all (except the output of the print statements).

Inspecting the source page, I can find the script that fill the table it is:

// Loader for the "shares in index" table: builds a query for the current
// instrument, asks jQuery to load the proxy response into
// #sharesInIndexOutput, then post-processes the #sharesInIndexTable it expects
// to find there. Registered to run on document ready (last line).
var sharesInIndex = { 
load: function () {
// Defaults; the two checks below may override them.
var index = webCore.getInstrument();
var nLabel = 'nm';
var hiddenAttributes = ",lists,tp,hlp,isin,note,";
var xslt = "inst_table.xsl";
var options =  ",noflag,sectoridicon,";
var xpath = "//index//instruments";
// Check if Swedish ränteindex or Icelandic ränteindex.
if ( index.indexOf('OMFSE') >= 0 || webCore.getInstrument().indexOf('IS00000') >= 0 ) {
    hiddenAttributes += ",to,sectid,";
    nLabel = 'fnm';
}

// Check if weights index present (typeof)
// The instrument is matched against a fixed comma-separated list of ISINs.
var shbindex = ",SE0002834820,SE0002834838,SE0002834846,SE0002977397,";
if ( shbindex.indexOf(index) >= 0 ) {
    xslt = "inst_table_windex.xsl";
    options += "windex,";
    xpath = "//index";
}

// The query carries the table id ("sharesInIndexTable") as ext_xslt_tableId.
var query = webCore.createQuery(
    Utils.Constants.marketAction.getIndexInstrument, {
    inst__a: "0,1,2,5,37,4,20,21,23,24,33,34,97,129,98,10", /* 87,*/
    Instrument: index,
    XPath: xpath,
    ext_xslt: xslt,
    ext_xslt_lang: currentLanguage,
    ext_xslt_tableId: "sharesInIndexTable",
    ext_xslt_hiddenattrs: hiddenAttributes,
    ext_xslt_notlabel: nLabel,
    ext_xslt_options: options
  });

  // Clear the output container and show a loading image while the request runs.
  $("#sharesInIndexOutput").empty().loading("/static/nordic/css/img/loading.gif");
  // Send the query as "xmlquery" to the proxy URL; the callback runs afterwards.
  $("#sharesInIndexOutput").load( webCore.getProxyURL('prod'), {xmlquery: query},
    function( responseText, textStatus, XMLHttpRequest) {
      // Enable sorting on the table.
      $("#sharesInIndexTable").tablesorter({
        widgets: ['zebra'], 
        textExtraction: 'complex', 
        numberFormat: Utils.Constants.numberFormat[currentLanguage]
        });
      // Set each link's href from its "name" attribute, so the links get
      // their target only after this callback has run.
      $("#sharesInIndexTable a").each( function() {
        $(this).attr("href",webCore.getURL( Utils.Constants.pages.micrositeShare, $(this).attr('name') ));
      });
    });
  }
};

// Run the loader once the document is ready.
$(document).ready( sharesInIndex.load );

I know there is an "execute_script" command but I don't know how to implement it, nor have I found any examples suitable for this - I don't mind if the result is JSON or HTML or plain text. I believe this is where the answer will lie: (1) load the page, (2) run the script for the page, (3) get the results, (4) parse/print/save the results...

I would prefer a headless solution if there is one; even PhantomJS on Windows is not completely headless, as it pops up a cmd window (I am aware that you can get rid of this with a virtual display on Linux - but that is not my environment). Also, just telling me "oh, you have to poll it to see if the data is loaded, then you retrieve it" is not very helpful: can you tell me (even in pseudo code) how a poll is done and, more importantly, roughly where in the program that polling takes place (that's why I posted fully executable code - if someone else has the same problem they should have a complete and easy-to-understand answer).

My latest attempts (1 - insert a delay to allow the AJAX to load)

import sys  
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *
import time

class Render(QWebPage):
    """QWebPage that loads a URL, sleeps five seconds after load, then stops.

    Constructing an instance blocks until the ``loadFinished`` handler has
    quit the Qt event loop started by the constructor.
    """

    def __init__(self, url):
        """Create the application, request *url* and run the event loop."""
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # The load is requested first; the handler is attached afterwards.
        page_url = QUrl(url)
        self.mainFrame().load(page_url)
        self.loadFinished.connect(self._loadFinished)
        self.app.exec_()

    def _loadFinished(self, result):
        """Wait, store the current frame on ``self.frame`` and quit.

        *result* is the value delivered by ``loadFinished``; it is ignored.
        """
        # NOTE(review): this sleep runs inside the slot, so the event loop
        # presumably cannot process the page's AJAX replies during the
        # wait -- confirm; a timer-based delay would return to the loop.
        time.sleep(5)
        self.frame = self.currentFrame()
        self.app.quit()

# Index page whose constituents table is the scraping target.
url = 'http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840'  
# Render's constructor runs the Qt event loop and returns after the
# five-second sleep in its loadFinished handler.
r = Render(url)  
html = r.frame.toHtml()
print(html)

(2 - polling for a known item in the source page) - item found with firebug inspector - maybe the syntax is wrong for the argument of findFirst.

import sys  
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *
import time

class Render(QWebPage):
    """QWebPage that loads a URL and loops on an element lookup after load.

    Constructing an instance blocks until the ``loadFinished`` handler has
    quit the Qt event loop started by the constructor.
    """

    def __init__(self, url):
        """Create the application, request *url* and run the event loop."""
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # The load is requested first; the handler is attached afterwards.
        page_url = QUrl(url)
        self.mainFrame().load(page_url)
        self.loadFinished.connect(self._loadFinished)
        self.app.exec_()

    def _loadFinished(self, result):
        """Count seconds while the lookup is truthy, then store the frame.

        *result* is the value delivered by ``loadFinished``; it is ignored.
        """
        # NOTE(review): the loop keeps going while findFirst() returns a
        # truthy object, so it looks like it can only end if the lookup
        # result is falsy -- confirm how QWebElement evaluates as a boolean
        # (isNull() is the explicit check). The selector string also lacks
        # the "#" of a CSS id selector, and the sleep runs inside the slot.
        attempts = 0
        while True:
            hit = self.mainFrame().documentElement().findFirst("id=sharesInIndexTable")
            if not hit:
                break
            attempts += 1
            print(attempts)
            time.sleep(1)
        self.frame = self.currentFrame()
        self.app.quit()

# Index page whose constituents table is the scraping target.
url = 'http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840'  
# Render's constructor returns only once its loadFinished handler leaves
# the polling loop and quits the event loop.
r = Render(url)  
html = r.frame.toHtml()
print(html)

This last one has a counter to show if something is happening. It counts forever and has to be stopped with ctrl-c.

(3 - Another variant using WebElement)

import sys  
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *
import time

class Render(QWebPage):
    """QWebPage that loads a URL and dumps details of one element after load.

    Constructing an instance blocks until the ``loadFinished`` handler has
    quit the Qt event loop started by the constructor.
    """

    def __init__(self, url):
        """Create the application, request *url* and run the event loop."""
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # The load is requested first; the handler is attached afterwards.
        page_url = QUrl(url)
        self.mainFrame().load(page_url)
        self.loadFinished.connect(self._loadFinished)
        self.app.exec_()

    def _loadFinished(self, result):
        """Print what the element lookup returned, store the frame and quit.

        *result* is the value delivered by ``loadFinished``; it is ignored.
        """
        # NOTE(review): the selector string lacks the "#" of a CSS id
        # selector, so the lookup presumably returns a null element, which
        # would match the empty output recorded below -- confirm.
        root = self.mainFrame().documentElement()
        table = root.findFirst("id=sharesInIndexTable")
        # Observed: <PyQt4.QtWebKit.QWebElement object at 0x0319FB0>
        print(table)
        print("Attributes:")
        # Observed: [] (no attributes)
        print(table.attributeNames())
        print("Classes: ")
        # Observed: [] (no classes)
        print(table.classes())
        inner_xml = table.toInnerXml()
        outer_xml = table.toOuterXml()
        # Observed: nothing after either label.
        print("InnerXML: " + inner_xml)
        print("OuterXML: " + outer_xml)
        print("Done")
        self.frame = self.currentFrame()
        self.app.quit()

# Index page whose constituents table is the scraping target.
url = 'http://www.nasdaqomxnordic.com/index/index_info?Instrument=DK0016268840'  
# Render's constructor prints the element details from its loadFinished
# handler before returning; the HTML is fetched here but not printed.
r = Render(url)  
html = r.frame.toHtml()

No success either with this one. I put in the code what was printed. There is apparently an object there but I cannot see what is inside.

Abhimanu Kumar · Accepted Answer · 2014-12-18 21:09:51Z

1

I know it's been long time but this answer is for later visitors in a similar situation

I was hitting a similar problem and I tried various things, such as waiting for the loadFinished signal from QWebPage as well as from QWebFrame, waiting for the QWebFrame.initialLayoutCompleted() signal, etc.

The thing that finally worked for me is this:

I just rendered the page in a normal browser, inspected the element that was not getting rendered in PyQt due to JavaScript, and got the id of that element (if it is a div that in turn contains multiple elements, tables etc., then get the div id). Now, in the Python code, in the yourPage.loadFinished function, call yourFrame.evaluateJavaScript("document.getElementById('element_id_retrieved_earlier')").

And this will wait for the id to be retrieved which in turn will wait for the embedded script to be executed.

answered Dec 18, 2014 at 21:09

Abhimanu Kumar

1,79118 silver badges20 bronze badges

Sign up to request clarification or add additional context in comments.

1 Comment

Eric Over a year ago

I tried to implement your solution with euronext.com/en/products/equities/NL0000352565-XAMS/… and elementID = 'instrument-factsheet' but I must not be doing this right. Could you please post some code snippet of your method. Thanks.

Collectives™ on Stack Overflow

executing the script of a webpage with Python

1 Answer 1

1 Comment

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

1 Comment

Your Answer

Sign up or log in

Post as a guest

Related