From 2b874a556c92db0f76755d10e9b16616af1450dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jure=20=C5=A0orn?= Date: Sun, 27 Dec 2020 07:35:12 +0100 Subject: [PATCH] Scraping --- README.md | 12 +++++------- index.html | 12 +++++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index e4b3c86..f2efaa0 100644 --- a/README.md +++ b/README.md @@ -2445,17 +2445,15 @@ Scraping #### Scrapes Python's URL, version number and logo from its Wikipedia page: ```python # $ pip3 install requests beautifulsoup4 -import requests, sys -from bs4 import BeautifulSoup +import requests, bs4, sys URL = 'https://en.wikipedia.org/wiki/Python_(programming_language)' try: html = requests.get(URL).text - doc = BeautifulSoup(html, 'html.parser') + doc = bs4.BeautifulSoup(html, 'html.parser') table = doc.find('table', class_='infobox vevent') - rows = table.find_all('tr') - link = rows[11].find('a')['href'] - ver = rows[6].find('div').text.split()[0] - url_i = rows[0].find('img')['src'] + link = table.find('th', text='Website').next_sibling.a['href'] + ver = table.find('th', text='Stable release').next_sibling.strings.__next__() + url_i = table.find('img')['src'] image = requests.get(f'https:{url_i}').content with open('test.png', 'wb') as file: file.write(image) diff --git a/index.html b/index.html index ab816e1..9a75eb7 100644 --- a/index.html +++ b/index.html @@ -2140,17 +2140,15 @@ logger.<level>('A logging message.')
  • '<str>' - Max age as a string: '1 week, 3 days', '2 months', …
  • #Scraping

    Scrapes Python's URL, version number and logo from its Wikipedia page:

    # $ pip3 install requests beautifulsoup4
    -import requests, sys
    -from bs4 import BeautifulSoup
    +import requests, bs4, sys
     URL = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
     try:
         html  = requests.get(URL).text
    -    doc   = BeautifulSoup(html, 'html.parser')
    +    doc   = bs4.BeautifulSoup(html, 'html.parser')
         table = doc.find('table', class_='infobox vevent')
    -    rows  = table.find_all('tr')
    -    link  = rows[11].find('a')['href']
    -    ver   = rows[6].find('div').text.split()[0]
    -    url_i = rows[0].find('img')['src']
    +    link  = table.find('th', text='Website').next_sibling.a['href']
    +    ver   = table.find('th', text='Stable release').next_sibling.strings.__next__()
    +    url_i = table.find('img')['src']
         image = requests.get(f'https:{url_i}').content
         with open('test.png', 'wb') as file:
             file.write(image)