Scraping

commit 3d1516dde9 by Jure Šorn, 5 years ago (branch pull/9/head)

1 changed file with 5 additions and 22 deletions: README.md

@@ -1328,28 +1328,11 @@ from urllib.parse import quote, quote_plus, unquote, unquote_plus
 Scraping
 --------
 ```python
-# $ pip3 install beautifulsoup4
-from http.cookiejar import CookieJar
-from urllib.error import HTTPError, URLError
-from urllib.request import build_opener, HTTPCookieProcessor
-from bs4 import BeautifulSoup
-
-def scrape(url):
-    """Returns tree of HTML elements located at URL."""
-    jar = CookieJar()
-    opener = build_opener(HTTPCookieProcessor(jar))
-    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
-    try:
-        html = opener.open(url)
-    except ValueError as error:
-        return print(f'Malformed URL: {url}.\n{error}')
-    except (HTTPError, URLError) as error:
-        return print(f"Can't find URL: {url}.\n{error}")
-    return BeautifulSoup(html, 'html.parser')
-```
-
-```python
->>> document = scrape('https://en.wikipedia.org/wiki/Python_(programming_language)')
+# $ pip3 install requests beautifulsoup4
+>>> import requests
+>>> from bs4 import BeautifulSoup
+>>> page = requests.get('https://en.wikipedia.org/wiki/Python_(programming_language)')
+>>> document = BeautifulSoup(page.text, 'html.parser')
 >>> table = document.find('table', class_='infobox vevent')
 >>> rows = table.find_all('tr')
 >>> website = rows[11].find('a')['href']
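
The removed scrape() helper caught malformed and unreachable URLs before parsing, while the new inline example performs no error handling. For the same guard with requests, a minimal sketch could look like the following; it is not part of this commit, the scrape() name and browser User-Agent header are carried over from the deleted version, and the 10-second timeout is an arbitrary choice.

```python
# $ pip3 install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup

def scrape(url):
    """Returns tree of HTML elements located at URL, or None on failure."""
    try:
        # Raises RequestException subclasses for invalid URLs, network errors and timeouts.
        page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        # Raises HTTPError (also a RequestException) for 4xx/5xx status codes.
        page.raise_for_status()
    except requests.exceptions.RequestException as error:
        return print(f"Can't fetch URL: {url}.\n{error}")
    return BeautifulSoup(page.text, 'html.parser')
```

The rest of the example works unchanged on the returned tree, e.g. `document = scrape('https://en.wikipedia.org/wiki/Python_(programming_language)')` followed by the same find()/find_all() calls. Note that `rows[11]` depends on the current layout of the article's infobox.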
