Created
June 18, 2016 05:21
-
-
Save brbsix/c71d8f8643edbb23d0a86ec991ac4acc to your computer and use it in GitHub Desktop.
PyQt5 Scraper (Basic Example)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Sample scraper script | |
See: https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/ | |
""" | |
# standard imports | |
import sys | |
# third-party imports | |
import requests | |
from bs4 import BeautifulSoup | |
from pyvirtualdisplay import Display | |
from PyQt5.QtWebKitWidgets import QWebPage | |
from PyQt5.QtWidgets import QApplication | |
class Render(QWebPage):
    """Render HTML with PyQt5 WebKit.

    Constructing an instance loads the given markup into the page's main
    frame, runs the Qt event loop until ``loadFinished`` is emitted, and
    leaves the resulting markup in ``self.html``.
    """

    def __init__(self, html):
        """Load ``html`` and block until the page signals it has finished.

        Args:
            html: Markup handed to ``mainFrame().setHtml``.
        """
        # Qt allows only one application object per process, so reuse an
        # existing one instead of unconditionally constructing another.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        super().__init__()
        # Stays None until _loadFinished stores the frame's markup.
        self.html = None
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().setHtml(html)
        # Blocks here; _loadFinished calls quit() to return control.
        self.app.exec_()

    def _loadFinished(self, result):
        """Store the main frame's HTML and stop the event loop.

        Args:
            result: Value delivered by the ``loadFinished`` signal; it is
                not inspected here.
        """
        self.html = self.mainFrame().toHtml()
        self.app.quit()
url = 'https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/'

# get the raw HTML; the timeout keeps a stalled server from hanging the
# script forever, and raise_for_status stops us from rendering an error page
response = requests.get(url, timeout=30)
response.raise_for_status()
source_html = response.text

# return the JavaScript rendered HTML (Display is used as a context manager
# with visible=0 so the Render call runs inside it)
with Display(visible=0, size=(800, 600)):
    rendered_html = Render(source_html).html

# get the BeautifulSoup
soup = BeautifulSoup(rendered_html, 'html.parser')

# select_one returns None when nothing matches, so guard before reading .text
title = soup.select_one('title')
print('title is %r' % (title.text if title is not None else None))
I am only a beginner, but I'm almost sure QtWebKit for PyQt5 is deprecated or removed entirely; you should be using the web engine widget instead! @JoeDevlin
he's right. also see here: http://doc.qt.io/qt-5/qtwebenginewidgets-qtwebkitportingguide.html
You can solve this problem by using Python 3 and running this pip command: pip3 install PyQtWebEngine. It handles every dependency, so you do not need to install PyQt5 first and then deal with the PyQtWebEngine installation.
I use Linux Mint, and this worked for me.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I have just copied this into PyCharm and am getting the following: ImportError: No module named 'PyQt5.QtWebKitWidgets'. Do you know why that might be?