diff options
| author | Sophie Forrest <git@sophieforrest.com> | 2024-09-10 08:53:41 +1200 |
|---|---|---|
| committer | Sophie Forrest <git@sophieforrest.com> | 2024-09-10 08:53:41 +1200 |
| commit | 08d060660f2f6bd678770026e109e0cd7429e6f1 (patch) | |
| tree | 0769a5b614be7c1b5a1a3c44da7c00999c759f63 /src/lib.rs | |
| parent | d0256057ac4e2d18f30a7cd5845d315f3167ac67 (diff) | |
refactor: move parsing into separate function
Preparation for adding download capabilities.
Diffstat (limited to '')
| -rw-r--r-- | src/lib.rs | 74 |
1 files changed, 57 insertions, 17 deletions
diff --git a/src/lib.rs b/src/lib.rs index 38af605..3dc8e99 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,9 +5,12 @@ //! This is a simple program capable of parsing VUWs courses from the registry. It cannot correctly //! parse prerequisites, however. -use std::{collections::HashSet, fmt}; +use std::{ + collections::{HashMap, HashSet}, + fmt, +}; -use scraper::ElementRef; +use scraper::{CaseSensitivity, ElementRef, Html, Selector}; use serde::{Deserialize, Serialize}; use tracing::{debug, info}; @@ -269,20 +272,57 @@ impl TryFrom<&str> for Trimester { } } -impl fmt::Display for Course<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Course {{ title: {}, subtitle: {}, offered: {}, areas: [{}] }}", - self.title, - self.subtitle, - self.offered, - self.subject_areas - .iter() - // Necessary as Rust refuses to build Vec<&str> with &String. - .map(|s| &**s) - .collect::<Vec<&str>>() - .join(", "), - ) +/// Parses a [`Html`] document into a [`HashMap`] of courses. +/// +/// # Panics +/// +/// Panics if [`Selector`] fails to parse. +#[must_use] +pub fn parse_document(document: &Html) -> HashMap<&str, Course<'_>> { + let mut course_map: HashMap<&str, Course> = HashMap::new(); + + let mut subject_area = ""; + let mut working_course = Course::default(); + + for elem in document.select(&Selector::parse("p").expect("selector should always be valid")) { + let elem_value = elem.value(); + + if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) { + course_map + .entry(working_course.title) + .and_modify(|c| { + c.subject_areas.insert(subject_area); + }) + .or_insert(working_course); + working_course = Course::default(); + working_course.subject_areas.insert(subject_area); + + working_course.parse_courseid(elem); + } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) { + working_course.offered = false; + } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) { + if let Some(subject_area_name) = elem.first_child().and_then(|child| { + child + .first_child() + .and_then(|nexted_child| nexted_child.value().as_text()) + }) { + subject_area = &**subject_area_name; + } + } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) { + let description = elem + .first_child() + .and_then(|el| el.first_child()?.value().as_text()) + .map(|t| &**t); + + working_course.description = description; + } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) { + working_course.parse_timetable(elem); + } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) { + working_course.parse_coursepoints(elem); + } } + + course_map.remove(""); + + course_map } |