diff options
| author | Sophie Forrest <git@sophieforrest.com> | 2024-09-10 08:53:41 +1200 |
|---|---|---|
| committer | Sophie Forrest <git@sophieforrest.com> | 2024-09-10 08:53:41 +1200 |
| commit | 08d060660f2f6bd678770026e109e0cd7429e6f1 (patch) | |
| tree | 0769a5b614be7c1b5a1a3c44da7c00999c759f63 | |
| parent | d0256057ac4e2d18f30a7cd5845d315f3167ac67 (diff) | |
refactor: move parsing into separate function
Preparation for adding download capabilities.
Diffstat (limited to '')
| -rw-r--r-- | src/lib.rs | 74 | ||||
| -rw-r--r-- | src/main.rs | 49 |
2 files changed, 60 insertions, 63 deletions
```diff
diff --git a/src/lib.rs b/src/lib.rs
index 38af605..3dc8e99 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,9 +5,12 @@
 //! This is a simple program capable of parsing VUWs courses from the registry. It cannot correctly
 //! parse prerequisites, however.
 
-use std::{collections::HashSet, fmt};
+use std::{
+    collections::{HashMap, HashSet},
+    fmt,
+};
 
-use scraper::ElementRef;
+use scraper::{CaseSensitivity, ElementRef, Html, Selector};
 use serde::{Deserialize, Serialize};
 use tracing::{debug, info};
 
@@ -269,20 +272,57 @@ impl TryFrom<&str> for Trimester {
     }
 }
 
-impl fmt::Display for Course<'_> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "Course {{ title: {}, subtitle: {}, offered: {}, areas: [{}] }}",
-            self.title,
-            self.subtitle,
-            self.offered,
-            self.subject_areas
-                .iter()
-                // Necessary as Rust refuses to build Vec<&str> with &String.
-                .map(|s| &**s)
-                .collect::<Vec<&str>>()
-                .join(", "),
-        )
+/// Parses a [`Html`] document into a [`HashMap`] of courses.
+///
+/// # Panics
+///
+/// Panics if [`Selector`] fails to parse.
+#[must_use]
+pub fn parse_document(document: &Html) -> HashMap<&str, Course<'_>> {
+    let mut course_map: HashMap<&str, Course> = HashMap::new();
+
+    let mut subject_area = "";
+    let mut working_course = Course::default();
+
+    for elem in document.select(&Selector::parse("p").expect("selector should always be valid")) {
+        let elem_value = elem.value();
+
+        if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) {
+            course_map
+                .entry(working_course.title)
+                .and_modify(|c| {
+                    c.subject_areas.insert(subject_area);
+                })
+                .or_insert(working_course);
+            working_course = Course::default();
+            working_course.subject_areas.insert(subject_area);
+
+            working_course.parse_courseid(elem);
+        } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) {
+            working_course.offered = false;
+        } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) {
+            if let Some(subject_area_name) = elem.first_child().and_then(|child| {
+                child
+                    .first_child()
+                    .and_then(|nexted_child| nexted_child.value().as_text())
+            }) {
+                subject_area = &**subject_area_name;
+            }
+        } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) {
+            let description = elem
+                .first_child()
+                .and_then(|el| el.first_child()?.value().as_text())
+                .map(|t| &**t);
+
+            working_course.description = description;
+        } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) {
+            working_course.parse_timetable(elem);
+        } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) {
+            working_course.parse_coursepoints(elem);
+        }
     }
+
+    course_map.remove("");
+
+    course_map
 }
diff --git a/src/main.rs b/src/main.rs
index 05f3976..17a95c7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,10 +7,10 @@
 
 use std::{collections::HashMap, fs};
 
-use scraper::{CaseSensitivity, Html, Selector};
+use scraper::Html;
 use serde::{Deserialize, Serialize};
 use tracing::level_filters::LevelFilter;
-use vuw_course_scraper::Course;
+use vuw_course_scraper::{parse_document, Course};
 
 /// Utility struct for exporting to JSON.
 #[derive(Clone, Deserialize, Serialize)]
@@ -29,50 +29,7 @@ fn main() {
 
     let document = Html::parse_document(html);
 
-    let mut course_map: HashMap<&str, Course> = HashMap::new();
-
-    let mut subject_area = "";
-    let mut working_course = Course::default();
-
-    for elem in document.select(&Selector::parse("p").expect("selector should always be valid")) {
-        let elem_value = elem.value();
-
-        if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) {
-            course_map
-                .entry(working_course.title)
-                .and_modify(|c| {
-                    c.subject_areas.insert(subject_area);
-                })
-                .or_insert(working_course);
-            working_course = Course::default();
-            working_course.subject_areas.insert(subject_area);
-
-            working_course.parse_courseid(elem);
-        } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) {
-            working_course.offered = false;
-        } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) {
-            if let Some(subject_area_name) = elem.first_child().and_then(|child| {
-                child
-                    .first_child()
-                    .and_then(|nexted_child| nexted_child.value().as_text())
-            }) {
-                subject_area = &**subject_area_name;
-            }
-        } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) {
-            let description = elem
-                .first_child()
-                .and_then(|el| el.first_child()?.value().as_text())
-                .map(|t| &**t);
-
-            working_course.description = description;
-        } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) {
-            working_course.parse_timetable(elem);
-        } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) {
-            working_course.parse_coursepoints(elem);
-        }
-    }
-
-    course_map.remove("");
+    let course_map = parse_document(&document);
 
     fs::write(
         "./export.json",
```