
I want to get the whole list of PDF links from this page: 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'

The problem is that the webpage uses JavaScript internally to render the links, so I could not get the PDF links.

I tried to parse the page in various ways found through googling, but I failed. Can you suggest the proper way to solve this problem?

Below is the code I tried, which failed:

import os
import urllib.request

from bs4 import BeautifulSoup

def crawl_kiwoom_mletter():
    # dir_output_mletter is defined elsewhere in my script.
    if not os.path.exists(dir_output_mletter):
        os.makedirs(dir_output_mletter)

    #urlformat = 'https://www.kiwoom.com/nkw.template.do?m=m0601010101&s_menu=ML&s_sqno=4784'
    # NOTE: this format string has no '{}' placeholder, so every iteration
    # fetches the same frameset URL.
    urlformat = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'

    index = -1
    while True:
        index = index + 1
        url = urlformat.format(index)
        print('processing {}...'.format(url))
        page = urllib.request.urlopen(url)

        soup = BeautifulSoup(page, 'lxml')

        #print_anchors(soup)

        # The fetched HTML contains only the frameset, not the PDF links.
        print(soup.prettify())
        '''
        if browse_mbriefing_linkpages(soup) == False:
            break
        '''
        break

'''
https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/
'''

import sys
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
from lxml import html

class Render(QWebPage):
    """Renders a page with QtWebKit so JavaScript-generated content is loaded."""

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()


def crawl_kiwoom_mletter2():
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000&source=&xdr='
    # This does the magic: loads everything, including script-generated content.
    r = Render(url)
    # The result is a QString.
    result = r.frame.toHtml()

    print(result)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

'''
http://stackoverflow.com/questions/28289699/python-web-scraping-for-javascript-generated-content
'''    
def crawl_kiwoom_mletter3():
    browser = webdriver.Firefox()
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
    browser.get(url)
    res = browser.page_source

    print(res)

    browser.close()
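
A possible refinement of the Selenium attempt, as a sketch only, using the imports already in place above: the WebDriverWait and expected_conditions imports are meant for exactly this situation, waiting until JavaScript has produced the content. The frame handling below is an assumption (the PDF list appears to live in a frame whose src contains 'marketConditionMLList'; see the answer's comments below), and the a.file selector comes from the answer's code:

def crawl_kiwoom_mletter4():
    browser = webdriver.Firefox()
    browser.get('http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000')

    # Wait for the frameset, then switch into the frame that hosts the list page.
    # ASSUMPTION: the list frame's src contains 'marketConditionMLList';
    # inspect the frameset yourself to confirm the layout.
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'frame')))
    for frame in browser.find_elements(By.TAG_NAME, 'frame'):
        if 'marketConditionMLList' in (frame.get_attribute('src') or ''):
            browser.switch_to.frame(frame)
            break

    # Wait until the JavaScript-rendered PDF links exist, then read the source.
    WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.file')))
    print(browser.page_source)
    browser.quit()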
  • Surely, Selenium is the way to go. Commented Feb 21, 2016 at 6:08
  • So what exactly are you trying to get? The links for the "Morning Letters" don't actually have hrefs; they are AJAX calls that update the page with the PDF. Are you trying to download the PDFs, or reference the HTML page for the links? Commented Feb 21, 2016 at 6:41
  • @MorganG I'm trying to download all the PDFs I haven't downloaded yet. Thanks to Kenavoz's code, I could download several PDFs, but I want to know how to analyze this kind of webpage in order to build a periodical downloader tool. Commented Feb 21, 2016 at 17:42

1 Answer


Try this code, using Python 2 and BeautifulSoup 4:

from bs4 import BeautifulSoup
import re
import urllib, urllib2

def browse(page):
    url = 'http://bbn.kiwoom.com/bbn.marketConditionMLList.do'
    values = {
        'pageno': page,
        'basePath': '4',
        's_startdate': '20120822',
        's_enddate': '20200222',
    }

    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    page = urllib2.urlopen(req)
    soup = BeautifulSoup(page.read(), 'html.parser')

    aTagAll = soup.find_all('a', {'class': 'file'})

    for aTag in aTagAll:
        downloadFile(getParams(aTag))

def getParams(aTag):
    # The download parameters are embedded in the link's onclick handler:
    # openFile('realname','filename','snMakedate',...)
    params = {}
    m = re.search(r"openFile\('([^']*)','([^']*)','([^']*)", aTag['onclick'])
    params['realname'] = m.group(1)
    params['filename'] = m.group(2)
    params['snMakedate'] = m.group(3)
    return params

def downloadFile(params):
    print 'Downloading : %s' % params['filename']
    url = 'http://bbn.kiwoom.com/bbn.fileDownload.do'
    values = {
        's_realname': params['realname'],
        's_filename': params['filename'],
        's_snMakedate': params['snMakedate'],
        'pageno': '8',
        'basePath': '4'
    }
    data = urllib.urlencode(values)

    req = urllib2.Request(url, data)
    try:
        response = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print e.code
        print e.read()
        return

    # Write in binary mode: the response body is a PDF.
    file = open(params['filename'], 'wb')
    file.write(response.read())
    file.close()

# Pages 1 to 58 (the actual number of pages).
for pagenum in range(1, 59):
    browse(page=pagenum)

It gets all the links from the PDF list page and parses each one with the getParams function.

The parameters, plus an additional basePath param, are sent to the download URL using the urllib2 module.

I suggest you add a delay between requests to avoid overloading the server.
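
For example, a minimal variant of the loop above with a fixed pause (the one-second delay is an arbitrary choice, not something the site specifies):

import time

for pagenum in range(1, 59):
    browse(page=pagenum)
    time.sleep(1)  # be polite: pause between page requests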

UPDATE:

It now browses pages 1 to 58 (the actual number of pages) and parses all the links.


3 Comments

Thank you for your support. I can download the linked PDFs using your code after changing the open parameter from 'w' to 'wb'. I have two questions. 1. I could not find any links such as 'bbn.kiwoom.com/bbn.marketConditionMLList.do' or 'bbn.kiwoom.com/bbn.fileDownload.do' that you used in your code. How did you find those? 2. The original objective is not to download only the PDFs on the current page but to download all of the morning letter PDFs by moving through the linked pages. How can I move to the next page and get another PDF list?
bbn.kiwoom.com/bbn.marketConditionMLList.do is the src of the frame where the PDFs are actually listed. On that page, bbn.kiwoom.com/bbn.fileDownload.do is the action URL of the form used to download files. You can request bbn.kiwoom.com/bbn.marketConditionMLList.do with urllib2 + the pageno param to scrape the next pages.
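To make that concrete, here is a sketch of how one could list the frame srcs of the outer frameset page to find the page that actually holds the PDF list; the exact nesting on the live site may differ:

import urllib2
from bs4 import BeautifulSoup

page = urllib2.urlopen('http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000')
soup = BeautifulSoup(page.read(), 'html.parser')
# Frames may be nested; repeat on any src that is itself a frameset.
for frame in soup.find_all(['frame', 'iframe']):
    print frame.get('src')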
Could you let me know how to get the final rendered result of the webpage in which the 'bbn.kiwoom.com/' URLs you noted appear? I still have trouble getting the whole list of links. Thank you.
