// SPDX-License-Identifier: AGPL-3.0-or-later //! # VUW Course scraper //! //! This is a simple program capable of parsing VUWs courses from the registry. It cannot correctly //! parse prerequisites, however. use std::collections::{HashMap, HashSet}; use scraper::{CaseSensitivity, ElementRef, Html, Selector}; use serde::{Deserialize, Serialize}; use tracing::{debug, info}; /// Slice used for splitting requirements for parsing. const SPLIT_SLICE: &[char] = &[';', ',']; /// A VUW course, along with all relevant data. #[derive(Clone, Debug, Deserialize, Serialize)] #[non_exhaustive] pub struct Course<'a> { /// Description of the course. pub description: Option<&'a str>, /// Whether this course is offered in the upcoming year. pub offered: bool, /// Amount of points this course is worth. pub points: f32, /// Courses that must be taken before this course. pub prerequisites: Vec<&'a str>, /// Courses that cannot be taken if you take this course. pub restrictions: Vec<&'a str>, /// Subject areas this course belongs to. pub subject_areas: HashSet<&'a str>, /// Subtitle of this course, its longer name. pub subtitle: &'a str, /// Timetable of this course, includes trimesters and CRNs. pub timetable: Vec, /// Title of this course, also known as the code. pub title: &'a str, } impl<'a> Course<'a> { /// Parses the courde id. /// /// # Panics /// /// Panics if string is sliced in the middle of a character. pub fn parse_courseid(&mut self, elem: ElementRef<'a>) { elem.children().for_each(|child| { child.children().for_each(|c| { if let Some(text) = c.value().as_text() { // The actual text we're looking for let text: &str = text.trim(); self.title = text; } else if let Some(text) = c .first_child() .and_then(|node| node.value().as_text().map(|text| &**text)) { if let Some((indice, _char)) = text.char_indices().nth(1) { // Skip over "-" self.subtitle = text.get(indice..).expect("indice should be valid").trim(); } } }); }); } /// Parses the course points, prerequisites, and restrictions from the given element. /// /// # Panics /// /// Panics if parsing fails, or a slice is made in the middle of a character. pub fn parse_coursepoints(&mut self, elem: ElementRef<'a>) { // Parse course points, prerequisites, and exclusions. let details = elem .first_child() .and_then(|el| el.first_child()?.value().as_text()); if let Some(details) = details { let details_split: Vec<&str> = details.split(" \u{2022} ").take(2).collect(); info!("{:#?}", &details_split); // Occasionally there is extra whitespace here, so this needs to be trimmed. let points = details_split.first().expect("split should exist").trim(); debug!("{:?}", points); let points_slice = &points.get(..points.len() - 4).expect("should be at indice"); info!("{:?}", points_slice); let points = points_slice .parse::() .expect("should correctly parse points"); info!("{:?}", points); self.points = points; if let Some(requirements) = details_split.last().map(|s| s.trim()) { if requirements.starts_with("(X)") { self.restrictions = requirements .get(4..) .expect("should be at indice") .split(SPLIT_SLICE) .map(str::trim) .collect::>(); } else if requirements.starts_with("(P)") { let requirements = &requirements .get(4..) .expect("should be at indice") .split(" (X) ") .collect::>(); self.prerequisites = requirements .first() .map(|s| { s.split(SPLIT_SLICE) .map(str::trim) .filter(|s| !s.is_empty()) .collect::>() }) .unwrap_or_default(); if requirements.len() > 1 { self.restrictions = requirements .last() .map(|s| s.split(SPLIT_SLICE).map(str::trim).collect::>()) .unwrap_or_default(); } } else { self.prerequisites = vec![requirements]; } info!("{requirements}"); } } } /// Parses the course timetable. /// /// # Panics /// /// Panics if CRN doesn't exist, trimester doesn't exist, slice is made in the middle of /// a byte, CRN isn't parseable, or trimester isn't parseable. pub fn parse_timetable(&mut self, elem: ElementRef<'a>) { // Parse timetable / CRNs. let details = elem .first_child() .and_then(|el| el.first_child()?.value().as_text()); if let Some(details) = details { let details_split: Vec<&str> = details.split(" \u{2022} ").take(2).collect(); info!("{:#?}", &details_split); let offering = CourseOffering::new( details_split .last() .expect("course reference number should exist") .get(4..) .expect("course reference number digits should start at this indice") .split_whitespace() .next() .expect("course reference number should exist") .parse::() .expect("course reference number should be parseable"), Trimester::try_from( *details_split .first() .expect("trimester element should exist"), ) .expect("should be parseable into a trimester"), ); self.timetable.push(offering); } } } impl Default for Course<'_> { fn default() -> Self { Self { description: Option::default(), offered: true, points: f32::default(), prerequisites: Vec::default(), restrictions: Vec::default(), subject_areas: HashSet::default(), subtitle: "", timetable: Vec::default(), title: "", } } } /// A course offering, includes the CRN and [`Trimester`]. #[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)] #[non_exhaustive] pub struct CourseOffering { /// Reference number for this coursem e.g. 11723. pub course_reference_number: u16, /// Trimester this course is offered in. pub trimester: Trimester, } impl CourseOffering { /// Creates a new [`CourseOffering`]. #[must_use] pub const fn new(course_reference_number: u16, trimester: Trimester) -> Self { Self { course_reference_number, trimester, } } } /// Trimester information Victoria University of Wellington offers. #[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, PartialOrd, Ord, Serialize)] pub enum Trimester { /// Trimester one. One, /// Trimester two. Two, /// Trimester three. Three, /// Block dates. See course page for more information. BlockDates, /// Part year. See course page for more information. PartYear, /// Trimesters one and two. OneTwo, /// Trimesters two and three. TwoThree, /// Trimesters three and one. ThreeOne, /// Trimesters one, two, and three. FullYear, } impl TryFrom<&str> for Trimester { type Error = String; fn try_from(value: &str) -> Result { match value { "1/3" => Ok(Self::One), "2/3" => Ok(Self::Two), "3/3" => Ok(Self::Three), "block dates/3" => Ok(Self::BlockDates), "part year/3" => Ok(Self::PartYear), "1+2/3" => Ok(Self::OneTwo), "2+3/3" => Ok(Self::TwoThree), "3+1/3" => Ok(Self::ThreeOne), "1+2+3/3" | "2+3+1/3" | "full year" => Ok(Self::FullYear), _ => Err(String::from("Invalid trimester.")), } } } /// Parses a [`Html`] document into a [`HashMap`] of courses. /// /// # Panics /// /// Panics if [`Selector`] fails to parse. #[must_use] pub fn parse_document(document: &Html) -> HashMap<&str, Course<'_>> { let mut course_map: HashMap<&str, Course> = HashMap::new(); let mut subject_area = ""; let mut working_course = Course::default(); // PERF: Could we gain a meaningful speed boost by splitting this into chunks of each course? for elem in document.select(&Selector::parse("p").expect("selector should always be valid")) { let elem_value = elem.value(); if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) { course_map .entry(working_course.title) .and_modify(|c| { c.subject_areas.insert(subject_area); }) .or_insert(working_course); working_course = Course::default(); working_course.subject_areas.insert(subject_area); working_course.parse_courseid(elem); } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) { working_course.offered = false; } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) { if let Some(subject_area_name) = elem.first_child().and_then(|child| { child .first_child() .and_then(|nexted_child| nexted_child.value().as_text()) }) { subject_area = &**subject_area_name; } } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) { let description = elem .first_child() .and_then(|el| el.first_child()?.value().as_text()) .map(|t| &**t); working_course.description = description; } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) { working_course.parse_timetable(elem); } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) { working_course.parse_coursepoints(elem); } } course_map.remove(""); course_map }