# Scraping
# --------
# $ pip3 install beautifulsoup4
from http.cookiejar import CookieJar
from urllib.error import HTTPError, URLError
from urllib.request import build_opener, HTTPCookieProcessor


def scrape(url):
    """Return the tree of HTML elements located at `url`.

    The page is fetched through an opener that keeps cookies in a fresh
    `CookieJar` and sends a 'Mozilla/5.0' User-agent header.  The body is
    parsed with Beautiful Soup's built-in 'html.parser'.

    Args:
        url: Address of the page to download.

    Returns:
        A `BeautifulSoup` document on success.  If the URL is malformed or
        cannot be opened, a diagnostic is printed and `None` is returned.

    Example (output as recorded when the example was written)::

        >>> document = scrape('https://en.wikipedia.org/wiki/Python_(programming_language)')
        >>> table = document.find('table', class_='infobox vevent')
        >>> rows = table.find_all('tr')
        >>> rows[11].find('a')['href']
        'https://www.python.org/'
        >>> rows[6].find('div').text.split()[0]
        '3.7.2'
    """
    jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(jar))
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    # Keep the try body to the single call that can raise these errors.
    try:
        response = opener.open(url)
    except ValueError as error:
        print(f'Malformed URL: {url}.\n{error}')
        return None
    except (HTTPError, URLError) as error:
        print(f"Can't find URL: {url}.\n{error}")
        return None
    # Imported here so the third-party package is only required once a page
    # has actually been fetched.
    from bs4 import BeautifulSoup
    # Close the response (and its connection) as soon as parsing is done.
    with response:
        return BeautifulSoup(response, 'html.parser')