Scraping

6 years ago · 28a2d8e7f2
1 changed file with 34 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -1297,6 +1297,40 @@ from urllib.parse import quote, quote_plus, unquote, unquote_plus
 ```


+Scraping
+--------
+```python
+# $ pip3 install beautifulsoup4
+from http.cookiejar import CookieJar
+from urllib.error import HTTPError, URLError
+from urllib.request import build_opener, HTTPCookieProcessor
+from bs4 import BeautifulSoup
+
+def scrape(url):
+    """Returns tree of HTML elements located at URL."""
+    jar = CookieJar()
+    opener = build_opener(HTTPCookieProcessor(jar))
+    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
+    try:
+        html = opener.open(url)
+    except ValueError as error:
+        return print(f'Malformed URL: {url}.\n{error}')
+    except (HTTPError, URLError) as error:
+        return print(f"Can't find URL: {url}.\n{error}")
+    return BeautifulSoup(html, 'html.parser')
+```
+
+```python
+>>> document = scrape('https://en.wikipedia.org/wiki/Python_(programming_language)')
+>>> table    = document.find('table', class_='infobox vevent')  # Fails if scrape() returned None.
+>>> rows     = table.find_all('tr')
+>>> rows[11].find('a')['href']                                  # Positional index: depends on the infobox's row order.
+'https://www.python.org/'
+>>> rows[6].find('div').text.split()[0]
+'3.7.2'
+```
+
+
 Web
 ---
 ```python