Browse Source

Scraping

pull/9/head
Jure Šorn 5 years ago
parent
commit
28a2d8e7f2
1 changed files with 34 additions and 0 deletions
  1. 34
      README.md

34
README.md

@ -1297,6 +1297,40 @@ from urllib.parse import quote, quote_plus, unquote, unquote_plus
```
Scraping
--------
```python
# $ pip3 install beautifulsoup4
from http.cookiejar import CookieJar
from urllib.error import HTTPError, URLError
from urllib.request import build_opener, HTTPCookieProcessor
from bs4 import BeautifulSoup
def scrape(url):
    """Return the tree of HTML elements located at `url`.

    The page is requested through an opener that stores cookies in a
    CookieJar and sends 'Mozilla/5.0' as its 'User-agent' header.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A BeautifulSoup object built with the 'html.parser' backend, or
        None (after printing a message) when the URL is malformed or the
        page cannot be opened.
    """
    jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(jar))
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # Only the request itself sits in the try block, so parser errors are
    # never mistaken for URL problems.
    try:
        response = opener.open(url)
    except ValueError as error:
        # print() returns None, which doubles as the failure result.
        return print(f'Malformed URL: {url}.\n{error}')
    except (HTTPError, URLError) as error:
        return print(f"Can't find URL: {url}.\n{error}")
    # Close the response as soon as the markup has been read, instead of
    # leaving the connection open until garbage collection.
    with response:
        return BeautifulSoup(response, 'html.parser')
```
```python
>>> document = scrape('https://en.wikipedia.org/wiki/Python_(programming_language)')
>>> table = document.find('table', class_='infobox vevent')
>>> rows = table.find_all('tr')
>>> rows[11].find('a')['href']
'https://www.python.org/'
>>> rows[6].find('div').text.split()[0]
'3.7.2'
```
Web
---
```python

Loading…
Cancel
Save