// SPDX-License-Identifier: AGPL-3.0-or-later //! # VUW Course scraper //! //! Program capable of parsing VUWs courses from the registry. mod parser; use std::collections::{HashMap, HashSet}; use parser::{course_offering, offering, subtitle, title}; use scraper::{CaseSensitivity, ElementRef, Html, Selector}; use serde::{Deserialize, Serialize}; use tracing::info; /// Alias to the nom error type. type NomError<'a> = nom::Err>; /// A VUW course, along with all relevant data. #[derive(Clone, Debug, Deserialize, Serialize)] #[non_exhaustive] pub struct Course<'a> { /// Courses that must be taken at the same time as this course. pub corequisites: Vec<&'a str>, /// Description of the course. pub description: Option<&'a str>, /// Whether this course is offered in the upcoming year. pub offered: bool, /// Amount of points this course is worth. pub points: f32, /// Courses that must be taken before this course. pub prerequisites: Vec<&'a str>, /// Courses that cannot be taken if you take this course. pub restrictions: Vec<&'a str>, /// Subject areas this course belongs to. pub subject_areas: HashSet<&'a str>, /// Subtitle of this course, its longer name. pub subtitle: &'a str, /// Timetable of this course, includes trimesters and CRNs. pub timetable: Vec, /// Title of this course, also known as the code. pub title: &'a str, } impl<'a> Course<'a> { /// Parses the course id. /// /// # Errors /// /// This function will return an error if nom fails to parse the course title or subtitle. pub fn parse_courseid(&mut self, elem: ElementRef<'a>) -> Result<(), NomError> { for child in elem.children().flat_map(|child| child.children()) { if let Some(text) = child.value().as_text() { self.title = title(text)?.1; } else if let Some(text) = child .first_child() .and_then(|node| node.value().as_text().map(|text| &**text)) { self.subtitle = subtitle(text)?.1; } } Ok(()) } /// Parses the course points, prerequisites, and restrictions from the given element. /// /// # Errors /// /// Panics if parsing fails. pub fn parse_coursepoints(&mut self, elem: ElementRef<'a>) -> Result<(), NomError> { // Parse course points, prerequisites, and exclusions. let details = elem .first_child() .and_then(|el| el.first_child()?.value().as_text()); if let Some(details) = details { // Parse the info from our nom parser. let (_, (points, requirements)) = course_offering(details)?; self.points = points; if let Some((prerequisites, corequisites, restrictions)) = requirements { // None of these are guaranteed to exist, so we need to use let Some for these. if let Some(prerequisites) = prerequisites { self.prerequisites.push(prerequisites); } if let Some(corequisites) = corequisites { self.corequisites.push(corequisites); } if let Some(restrictions) = restrictions { self.restrictions.push(restrictions); } } } Ok(()) } /// Parses the course timetable. /// /// # Errors /// /// This function will return an error if nom fails to parse the timetable from the provided /// data. pub fn parse_timetable(&mut self, elem: ElementRef<'a>) -> Result<(), NomError> { // Parse timetable / CRNs. let details = elem .first_child() .and_then(|el| el.first_child()?.value().as_text()); if let Some(details) = details { info!("{:#?}", &details); self.timetable.push(offering(details)?.1); } Ok(()) } } impl Default for Course<'_> { fn default() -> Self { Self { corequisites: Vec::default(), description: Option::default(), offered: true, points: f32::default(), prerequisites: Vec::default(), restrictions: Vec::default(), subject_areas: HashSet::default(), subtitle: "", timetable: Vec::default(), title: "", } } } /// A course offering, includes the CRN and [`Trimester`]. #[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)] #[non_exhaustive] pub struct CourseOffering { /// Reference number for this coursem e.g. 11723. pub course_reference_number: u16, /// Trimester this course is offered in. pub trimester: Trimester, } impl CourseOffering { /// Creates a new [`CourseOffering`]. #[must_use] pub const fn new(course_reference_number: u16, trimester: Trimester) -> Self { Self { course_reference_number, trimester, } } } /// Trimester information Victoria University of Wellington offers. #[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, PartialOrd, Ord, Serialize)] pub enum Trimester { /// Trimester one. One, /// Trimester two. Two, /// Trimester three. Three, /// Block dates. See course page for more information. BlockDates, /// Part year. See course page for more information. PartYear, /// Trimesters one and two. OneTwo, /// Trimesters two and three. TwoThree, /// Trimesters three and one. ThreeOne, /// Trimesters one, two, and three. FullYear, } impl TryFrom<&str> for Trimester { type Error = &'static str; fn try_from(value: &str) -> Result { match value { "1/3" => Ok(Self::One), "2/3" => Ok(Self::Two), "3/3" => Ok(Self::Three), "block dates/3" => Ok(Self::BlockDates), "part year/3" => Ok(Self::PartYear), "1+2/3" => Ok(Self::OneTwo), "2+3/3" => Ok(Self::TwoThree), "3+1/3" => Ok(Self::ThreeOne), "1+2+3/3" | "2+3+1/3" | "3+1+2/3" | "full year" => Ok(Self::FullYear), _ => Err("Invalid trimester."), } } } /// Parses a [`Html`] document into a [`HashMap`] of courses. /// /// # Panics /// /// Panics if [`Selector`] fails to parse. #[must_use] pub fn parse_document(document: &Html) -> HashMap<&str, Course<'_>> { let mut course_map: HashMap<&str, Course> = HashMap::new(); let mut subject_area = ""; let mut working_course = Course::default(); // PERF: Could we gain a meaningful speed boost by splitting this into chunks of each course? for elem in document.select(&Selector::parse("p").expect("selector should always be valid")) { let elem_value = elem.value(); if elem_value.has_class("courseid", CaseSensitivity::CaseSensitive) { course_map .entry(working_course.title) .and_modify(|c| { c.subject_areas.insert(subject_area); }) .or_insert(working_course); working_course = Course::default(); working_course.subject_areas.insert(subject_area); working_course .parse_courseid(elem) .expect("could not parse courseid"); } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) { working_course.offered = false; } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) { if let Some(subject_area_name) = elem.first_child().and_then(|child| { child .first_child() .and_then(|nexted_child| nexted_child.value().as_text()) }) { subject_area = &**subject_area_name; } } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) { let description = elem .first_child() .and_then(|el| el.first_child()?.value().as_text()) .map(|t| &**t); working_course.description = description; } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) { working_course .parse_timetable(elem) .expect("could not parse timetable"); } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) { working_course .parse_coursepoints(elem) .expect("could not parse coursepoints"); } } course_map.remove(""); course_map }