<!-- Minimal data table: one header row followed by two data rows -->
<table>
      <tr> <!-- begin header (first) row -->
        <th>Heading 1</th> <!-- column 1 heading -->
        <th>Heading 2</th> <!-- column 2 heading -->
      </tr>
      <tr> <!-- begin second row -->
        <td>Row 1, Column 1</td>
        <td>Row 1, Column 2</td>
      </tr>
      <tr> <!-- begin third row -->
        <td>Row 2, Column 1</td>
        <td>Row 2, Column 2</td>
      </tr>
</table>

<?xml version="1.0" encoding="UTF-8"?>
<!-- Sample XML document: a library catalogue holding two book records. -->
<library>
    <!-- Each book carries an id and a status attribute; its details are nested child elements. -->
    <book id="b001" status="available">
        <title>The Great Gatsby</title>
        <!-- The author's name is split into separate firstName / lastName elements. -->
        <author>
            <firstName>F. Scott</firstName>
            <lastName>Fitzgerald</lastName>
        </author>
        <publicationYear>1925</publicationYear>
        <isbn>978-0-7432-7356-5</isbn>
        <!-- A book may list several genres, one genre element each. -->
        <genres>
            <genre>Fiction</genre>
            <genre>Classic</genre>
        </genres>
    </book>
    <!-- Second record: same structure, different status value. -->
    <book id="b002" status="checked-out">
        <title>1984</title>
        <author>
            <firstName>George</firstName>
            <lastName>Orwell</lastName>
        </author>
        <publicationYear>1949</publicationYear>
        <isbn>978-0-452-28423-4</isbn>
        <genres>
            <genre>Dystopian</genre>
            <genre>Science Fiction</genre>
        </genres>
    </book>
</library>

import requests
import bs4

# download the course web page; the timeout keeps the request from hanging
# forever if the server never responds
url = "https://facultyweb.cs.wwu.edu/~wehrwes/courses/data311_25f/"
response = requests.get(url, timeout=10)
# raise an HTTPError on a 4xx/5xx reply instead of continuing with an error page
response.raise_for_status()
# preview the first 500 characters of the raw HTML
print(response.text[:500])

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
  <meta charset="utf-8" />
  <meta name="generator" content="pandoc" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
  <meta name="author" content="Scott Wehrwein" />
  <title>DATA 311 - Fundamentals of Data Science</title>
  <style>
    code{white-space: pre-wrap;}
    span.smallcaps{font-variant: small-caps;}
    div.columns{display: flex; gap: min(4vw, 1.5em)

# parse the downloaded HTML into a searchable tree, using the 'html.parser' backend
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# get the first link in the document (<a> tag)
soup.a

<a href="#course-overview" id="toc-course-overview">Course
Overview</a>

# get the first h1 tag
soup.h1

<h1 class="title">DATA 311 - Fundamentals of Data Science</h1>

# get the text inside the h1 element we found above
soup.h1.text

'DATA 311 - Fundamentals of Data Science'

# get the href attribute of the a element we found above
soup.a["href"]

'#course-overview'

# use find to get the first a, equivalent to soup.a
soup.find("a")

<a href="#course-overview" id="toc-course-overview">Course
Overview</a>

# find the first table (which is the Schedule table)
schedule = soup.find("table")

# search the table for the first row with class = "odd"
schedule.find("tr", class_="odd")

<tr class="odd">
<td>09/24 (0)</td>
<td>Introduction and overview<br/>What is data science? What is data?
<br/><a href="lectures/L00/L00_slides.pdf">slides</a><br/><a href="lectures/L00/L00.html">typed notes</a><br/><a href="lectures/L00/W00.html">worksheet</a><br/><a href="lectures/L00/L00.pdf">whiteboard</a></td>
<td>Start of Quarter Survey (Canvas)</td>
<td>1.1, 1.3</td>
</tr>

# tag names can be chained: the first ul, then the first a inside it, then that link's text
soup.ul.a.text

'Course\nOverview'

# find the h2 with id="course-policies" by passing the `id` keyword argument to the find method
soup.find("h2", id="course-policies")

<h2 id="course-policies">Course Policies</h2>

# use find with a dict of attributes passed to the attrs kwarg
# we can even do this without specifying the type of tag!
soup.find(attrs={"href": "#course-policies"})

<a href="#course-policies" id="toc-course-policies">Course
Policies</a>

# count all tr elements in the whole document
# (find_all is called on soup, so the search is not limited to the first table)
len(soup.find_all("tr"))

30

# count only the tr elements whose class is "odd" or "even"
# (a list of values matches a tag that has any one of them)
len(soup.find_all("tr", attrs={"class": ["odd", "even"]}))

29

# collect the text of every link (a tag) found inside the first nav element
[tag.text for tag in soup.nav.find_all("a")]

['Course\nOverview',
 'Assessment',
 'Resources',
 'Logistics',
 'Schedule',
 'Course\nPolicies']

# fetch a second page and parse it, replacing the previous url/response/soup;
# the timeout keeps the request from hanging forever if the server never responds
url = "https://cs.wwu.edu/faculty"
response = requests.get(url, timeout=10)
# raise an HTTPError on a 4xx/5xx reply instead of parsing an error page
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'html.parser')

L11 - Data Collection and Structured Data 1¶

HTML, XML, and Web Scraping¶

Announcements:¶

Goals:¶

HTML - HyperText Markup Language¶

Basic Elements¶

Heading 1

Heading 2

Heading 6

XML¶

Web Scraping¶

So you want some data, but you can only find it buried in some webpage.¶

Demo¶

Heading 1	Heading 2
Row 1, Column 1	Row 1, Column 2
Row 2, Column 1	Row 2, Column 2