From 720b1e2b44dd25b40e661f5f60a6509182afb9b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jure=20=C5=A0orn?=
Date: Thu, 27 Jun 2019 23:27:07 +0200
Subject: [PATCH] Scraping

---
 README.md  | 38 ++++++++++++--------------------------
 index.html | 35 ++++++++++++-----------------------
 2 files changed, 24 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 36fa0dd..915f399 100644
--- a/README.md
+++ b/README.md
@@ -1911,33 +1911,19 @@ retention=||
 
 Scraping
 --------
+#### Scrapes and prints Python's URL and version number from Wikipedia:
 ```python
 # $ pip3 install requests beautifulsoup4
->>> import requests
->>> from bs4 import BeautifulSoup
->>> url   = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
->>> page  = requests.get(url)
->>> doc   = BeautifulSoup(page.text, 'html.parser')
->>> table = doc.find('table', class_='infobox vevent')
->>> rows  = table.find_all('tr')
->>> link  = rows[11].find('a')['href']
->>> ver   = rows[6].find('div').text.split()[0]
->>> link, ver
-('https://www.python.org/', '3.7.2')
-```
-
-### Selenium
-**Library for scraping dynamically generated web content.**
-
-```python
-# $ brew cask install chromedriver
-# $ pip3 install selenium
->>> from selenium import webdriver
->>> driver = webdriver.Chrome()
->>> driver.get(url)
->>> xpath  = '//*[@id="mw-content-text"]/div/table[1]/tbody/tr[7]/td/div'
->>> driver.find_element_by_xpath(xpath).text.split()[0]
-'3.7.2'
+import requests
+from bs4 import BeautifulSoup
+url   = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
+page  = requests.get(url)
+doc   = BeautifulSoup(page.text, 'html.parser')
+table = doc.find('table', class_='infobox vevent')
+rows  = table.find_all('tr')
+link  = rows[11].find('a')['href']
+ver   = rows[6].find('div').text.split()[0]
+print(link, ver)
 ```
 
 
@@ -2049,7 +2035,7 @@ from datetime import datetime
 time_str = datetime.now().strftime('%Y%m%d%H%M%S')
 filename = f'profile-{time_str}.png'
 drawer = output.GraphvizOutput(output_file=filename)
-with PyCallGraph(output=drawer):
+with PyCallGraph(drawer):
     <code_to_be_profiled>
 ```
 
diff --git a/index.html b/index.html
index 144bb44..0231f4b 100644
--- a/index.html
+++ b/index.html
@@ -1614,29 +1614,18 @@ logger.<level>('A logging message.')
  • '<str>' - Max age as a string: '1 week, 3 days', '2 months', …
  • #Scraping

    +

    Scrapes and prints Python's URL and version number from Wikipedia:

    # $ pip3 install requests beautifulsoup4
    ->>> import requests
    ->>> from bs4 import BeautifulSoup
    ->>> url   = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
    ->>> page  = requests.get(url)
    ->>> doc   = BeautifulSoup(page.text, 'html.parser')
    ->>> table = doc.find('table', class_='infobox vevent')
    ->>> rows  = table.find_all('tr')
    ->>> link  = rows[11].find('a')['href']
    ->>> ver   = rows[6].find('div').text.split()[0]
    ->>> link, ver
    -('https://www.python.org/', '3.7.2')
    -
    -

    Selenium

    -

    Library for scraping dynamically generated web content.

    -
    # $ brew cask install chromedriver
    -# $ pip3 install selenium
    ->>> from selenium import webdriver
    ->>> driver = webdriver.Chrome()
    ->>> driver.get(url)
    ->>> xpath  = '//*[@id="mw-content-text"]/div/table[1]/tbody/tr[7]/td/div'
    ->>> driver.find_element_by_xpath(xpath).text.split()[0]
    -'3.7.2'
    +import requests
    +from bs4 import BeautifulSoup
    +url   = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
    +page  = requests.get(url)
    +doc   = BeautifulSoup(page.text, 'html.parser')
    +table = doc.find('table', class_='infobox vevent')
    +rows  = table.find_all('tr')
    +link  = rows[11].find('a')['href']
    +ver   = rows[6].find('div').text.split()[0]
    +print(link, ver)
     

    #Web

    # $ pip3 install bottle
    @@ -1719,7 +1708,7 @@ Line #      Hits         Time  Per Hit   % Time  Line Contents
     time_str = datetime.now().strftime('%Y%m%d%H%M%S')
     filename = f'profile-{time_str}.png'
     drawer = output.GraphvizOutput(output_file=filename)
    -with PyCallGraph(output=drawer):
    +with PyCallGraph(drawer):
         <code_to_be_profiled>
     

    #NumPy