diff options
| author | Sophie Forrest <git@sophieforrest.com> | 2024-09-06 13:55:19 +1200 |
|---|---|---|
| committer | Sophie Forrest <git@sophieforrest.com> | 2024-09-06 13:55:19 +1200 |
| commit | c4ce297ff951583c9ffe3a88aa22933577b329da (patch) | |
| tree | 93f55666b0ba0d74be87e10380dcc25a84a17677 /src/main.rs | |
| parent | 151ab2e8a837242f9654be1280286dc9514fe49c (diff) | |
refactor: make clippy happy + no cloning
Diffstat (limited to '')
| -rw-r--r-- | src/main.rs | 331 |
1 file changed, 73 insertions, 258 deletions
diff --git a/src/main.rs b/src/main.rs index ee7686e..05f3976 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,270 +1,85 @@ // SPDX-License-Identifier: AGPL-3.0-or-later -use serde::{Deserialize, Serialize}; -use std::{ - collections::{HashMap, HashSet}, - fmt, fs, -}; +//! # VUW Course scraper +//! +//! This is a simple program capable of parsing VUWs courses from the registry. It cannot correctly +//! parse prerequisites, however. -use tracing::{debug, info, level_filters::LevelFilter}; +use std::{collections::HashMap, fs}; use scraper::{CaseSensitivity, Html, Selector}; +use serde::{Deserialize, Serialize}; +use tracing::level_filters::LevelFilter; +use vuw_course_scraper::Course; -// TODO: Use string slices to avoid clones? -#[derive(Clone, Debug, Deserialize, Serialize)] -struct Course<'a> { - description: Option<&'a str>, - offered: bool, - points: u8, - prerequisites: Vec<String>, - restrictions: Vec<String>, - subject_areas: HashSet<String>, - subtitle: &'a str, - timetable: Vec<CourseOffering>, - title: &'a str, -} - -#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)] -struct CourseOffering { - course_reference_number: u16, - trimester: Trimester, -} - -#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, PartialOrd, Ord, Serialize)] -enum Trimester { - One, - Two, - Three, - BlockDates, - PartYear, - OneTwo, - TwoThree, - ThreeOne, - FullYear, -} - -impl TryFrom<&str> for Trimester { - type Error = String; - - fn try_from(value: &str) -> Result<Self, Self::Error> { - match value { - "1/3" => Ok(Self::One), - "2/3" => Ok(Self::Two), - "3/3" => Ok(Self::Three), - "block dates/3" => Ok(Self::BlockDates), - "part year/3" => Ok(Self::PartYear), - "1+2/3" => Ok(Self::OneTwo), - "2+3/3" => Ok(Self::TwoThree), - "3+1/3" => Ok(Self::ThreeOne), - "1+2+3/3" => Ok(Self::FullYear), - _ => Err(String::from("Invalid trimester.")), - } - } -} - +/// Utility struct for exporting to JSON. 
#[derive(Clone, Deserialize, Serialize)] struct JsonExport<'a> { - #[serde(borrow)] - courses: HashMap<&'a str, Course<'a>>, -} - -impl Default for Course<'_> { - fn default() -> Self { - Self { - description: Option::default(), - offered: true, - points: u8::default(), - prerequisites: Vec::default(), - restrictions: Vec::default(), - subject_areas: HashSet::default(), - subtitle: "", - timetable: Vec::default(), - title: "", - } - } -} - -impl fmt::Display for Course<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Course {{ title: {}, subtitle: {}, offered: {}, areas: [{}] }}", - self.title, - self.subtitle, - self.offered, - self.subject_areas - .iter() - // Necessary as Rust refuses to build Vec<&str> with &String. - .map(|s| &s[..]) - .collect::<Vec<&str>>() - .join(", "), - ) - } + /// [`HashMap`] of all courses. + #[serde(borrow)] + courses: HashMap<&'a str, Course<'a>>, } fn main() { - tracing_subscriber::fmt() - .with_max_level(LevelFilter::INFO) - .init(); - - let html = include_str!("../courses.html"); - - let document = Html::parse_document(html); - - let mut course_map: HashMap<&str, Course> = HashMap::new(); - - let mut subject_area = String::new(); - let mut working_course = Course::default(); - - for elem in document.select(&Selector::parse("p").unwrap()) { - let elem_value = elem.value(); - - if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) { - course_map - .entry(working_course.title) - .and_modify(|c| { - c.subject_areas.insert(subject_area.clone()); - }) - .or_insert(working_course); - working_course = Course::default(); - working_course.subject_areas.insert(subject_area.clone()); - - elem.children().for_each(|child| { - child.children().for_each(|c| { - if c.value().is_text() { - let working = &c.value().as_text().unwrap()[..]; - - // Skip over space. 
- working_course.title = &working[..working.len() - 1]; - } else { - working_course.subtitle = &c - .first_child() - .unwrap() - .value() - .as_text() - .unwrap() - // Skip over "- ". - [4..]; - } - }); - }); - } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) { - working_course.offered = false; - } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) { - subject_area = elem - .first_child() - .unwrap() - .first_child() - .unwrap() - .value() - .as_text() - .unwrap() - .to_string(); - } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) { - let description = elem - .first_child() - .and_then(|el| el.first_child()?.value().as_text()) - .map(|t| &t[..]); - - working_course.description = description; - - // println!("{}", working_course.description); - } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) { - // Parse timetable / CRNs. - let details = elem - .first_child() - .and_then(|el| el.first_child()?.value().as_text()) - .map(|t| t.to_string()); - - if let Some(details) = details { - let details_split: Vec<&str> = details.split(" • ").take(2).collect(); - - info!("{:#?}", &details_split); - - let offering = CourseOffering { - course_reference_number: details_split.last().unwrap()[4..] - .parse::<u16>() - .unwrap(), - trimester: Trimester::try_from(*details_split.first().unwrap()).unwrap(), - }; - - working_course.timetable.push(offering); - } - } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) { - // Parse course points, prerequisites, and exclusions. - let details = elem - .first_child() - .and_then(|el| el.first_child()?.value().as_text()) - .map(|t| t.to_string()); - - if let Some(details) = details { - let details_split: Vec<&str> = details.split(" • ").take(2).collect(); - - info!("{:#?}", &details_split); - - // Occasionally there is extra whitespace here, so this needs to be trimmed. 
- let points = details_split.first().unwrap().trim(); - debug!("{:?}", points); - - let points_slice = &points[..points.len() - 4]; - info!("{:?}", points_slice); - - let points = points_slice.parse::<u8>().unwrap(); - info!("{:?}", points); - - working_course.points = points; - - if let Some(requirements) = details_split.last().map(|s| s.trim()) { - if requirements.starts_with("(X)") { - working_course.restrictions = requirements[4..] - .split(&[';', ',']) - .map(str::trim) - .map(str::to_owned) - .collect::<Vec<String>>(); - } else { - let requirements = &requirements[4..].split(" (X) ").collect::<Vec<&str>>(); - - working_course.prerequisites = requirements - .first() - .unwrap() - .split(&[',', ';']) - .map(str::trim) - .filter(|s| !s.is_empty()) - .map(str::to_owned) - .collect(); - - if requirements.len() > 1 { - working_course.restrictions = requirements - .last() - .unwrap() - .split(&[',', ';']) - .map(str::trim) - .map(str::to_owned) - .collect(); - } - } - - info!("{requirements}"); - } - } - } - } - - debug!("{:?}", course_map.get("COMP 102")); - - course_map.remove(""); - - fs::write( - "./export.json", - simd_json::serde::to_string(&JsonExport { - courses: course_map, - }) - .unwrap(), - ) - .unwrap(); - - // course_map - // .values() - // .for_each(|c| println!("{:#?}", c.subject_areas)); - // course_map.values().for_each(|c| println!("{c}")); + tracing_subscriber::fmt() + .with_max_level(LevelFilter::INFO) + .init(); + + let html = &fs::read_to_string("./courses.html").expect("file does not exist"); + + let document = Html::parse_document(html); + + let mut course_map: HashMap<&str, Course> = HashMap::new(); + + let mut subject_area = ""; + let mut working_course = Course::default(); + + for elem in document.select(&Selector::parse("p").expect("selector should always be valid")) { + let elem_value = elem.value(); + + if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) { + course_map + .entry(working_course.title) + 
.and_modify(|c| { + c.subject_areas.insert(subject_area); + }) + .or_insert(working_course); + working_course = Course::default(); + working_course.subject_areas.insert(subject_area); + + working_course.parse_courseid(elem); + } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) { + working_course.offered = false; + } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) { + if let Some(subject_area_name) = elem.first_child().and_then(|child| { + child + .first_child() + .and_then(|nexted_child| nexted_child.value().as_text()) + }) { + subject_area = &**subject_area_name; + } + } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) { + let description = elem + .first_child() + .and_then(|el| el.first_child()?.value().as_text()) + .map(|t| &**t); + + working_course.description = description; + } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) { + working_course.parse_timetable(elem); + } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) { + working_course.parse_coursepoints(elem); + } + } + + course_map.remove(""); + + fs::write( + "./export.json", + simd_json::serde::to_string(&JsonExport { + courses: course_map, + }) + .expect("json should parse correctly"), + ) + .expect("file should be writable"); } |