| author | Sophie Forrest <git@sophieforrest.com> | 2024-09-06 13:55:19 +1200 |
|---|---|---|
| committer | Sophie Forrest <git@sophieforrest.com> | 2024-09-06 13:55:19 +1200 |
| commit | c4ce297ff951583c9ffe3a88aa22933577b329da (patch) | |
| tree | 93f55666b0ba0d74be87e10380dcc25a84a17677 | |
| parent | 151ab2e8a837242f9654be1280286dc9514fe49c (diff) | |
refactor: make clippy happy + no cloning
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | .rustfmt.toml | 13 |
| -rw-r--r-- | .taplo.toml | 10 |
| -rw-r--r-- | Cargo.toml | 66 |
| -rw-r--r-- | clippy.toml | 1 |
| -rw-r--r-- | src/lib.rs | 287 |
| -rw-r--r-- | src/main.rs | 331 |

6 files changed, 450 insertions, 258 deletions
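
The "no cloning" half of this commit is that course data now borrows from the parsed HTML instead of allocating owned strings: the `String`-typed fields of the old `Course` in `src/main.rs` become `&'a str` in the new `src/lib.rs`. A minimal sketch of the shape change (illustrative only; the real definitions are in the `src/lib.rs` hunk below):

```rust
use std::collections::HashSet;

// Before (old src/main.rs): requirement and subject-area strings were cloned into owned Strings.
struct OwnedCourse {
    prerequisites: Vec<String>,
    subject_areas: HashSet<String>,
}

// After (new src/lib.rs): the course borrows string slices that live as long as the parsed document.
struct BorrowedCourse<'a> {
    prerequisites: Vec<&'a str>,
    subject_areas: HashSet<&'a str>,
}
```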
diff --git a/.rustfmt.toml b/.rustfmt.toml
new file mode 100644
index 0000000..3a60dda
--- /dev/null
+++ b/.rustfmt.toml
@@ -0,0 +1,13 @@
+edition = "2021"
+
+format_code_in_doc_comments = true
+format_strings = true
+group_imports = "StdExternalCrate"
+hard_tabs = true
+hex_literal_case = "Upper"
+imports_granularity = "Crate"
+reorder_impl_items = true
+reorder_modules = true
+use_field_init_shorthand = true
+use_try_shorthand = true
+wrap_comments = true
diff --git a/.taplo.toml b/.taplo.toml
new file mode 100644
index 0000000..9773ffd
--- /dev/null
+++ b/.taplo.toml
@@ -0,0 +1,10 @@
+[formatting]
+indent_string = " "
+reorder_keys = false
+
+[[rule]]
+include = ["**/Cargo.toml"]
+keys = ["dependencies", "lints.clippy"]
+
+[rule.formatting]
+reorder_keys = true
diff --git a/Cargo.toml b/Cargo.toml
index 3356402..1d95862 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,3 +9,69 @@ serde = { version = "1.0.209", features = ["derive"] }
 simd-json = "0.13.10"
 tracing = "0.1.40"
 tracing-subscriber = "0.3.18"
+
+[lints.rust]
+missing_copy_implementations = "warn"
+missing_debug_implementations = "warn"
+missing_docs = "warn"
+single_use_lifetimes = "warn"
+unsafe_code = "warn"
+unused = { level = "warn", priority = -1 }
+
+[lints.clippy]
+complexity = { level = "warn", priority = -1 }
+nursery = { level = "warn", priority = -1 }
+pedantic = { level = "warn", priority = -1 }
+perf = { level = "warn", priority = -1 }
+suspicious = { level = "warn", priority = -1 }
+alloc_instead_of_core = "warn"
+as_underscore = "warn"
+clone_on_ref_ptr = "warn"
+create_dir = "warn"
+dbg_macro = "warn"
+default_numeric_fallback = "warn"
+default_union_representation = "warn"
+deref_by_slicing = "warn"
+empty_structs_with_brackets = "warn"
+exit = "warn"
+filetype_is_file = "warn"
+fn_to_numeric_cast = "warn"
+format_push_string = "warn"
+get_unwrap = "warn"
+if_then_some_else_none = "warn"
+implicit_return = "allow"
+indexing_slicing = "warn"
+large_include_file = "warn"
+let_underscore_must_use = "warn"
+lossy_float_literal = "warn"
+map_err_ignore = "warn"
+mem_forget = "warn"
+missing_docs_in_private_items = "warn"
+missing_trait_methods = "warn"
+mod_module_files = "warn"
+multiple_inherent_impl = "warn"
+mutex_atomic = "warn"
+needless_return = "warn"
+non_ascii_literal = "warn"
+panic_in_result_fn = "warn"
+pattern_type_mismatch = "warn"
+rc_buffer = "warn"
+rc_mutex = "warn"
+rest_pat_in_fully_bound_structs = "warn"
+same_name_method = "warn"
+separated_literal_suffix = "warn"
+str_to_string = "warn"
+string_add = "warn"
+string_slice = "warn"
+string_to_string = "warn"
+tabs_in_doc_comments = "allow"
+try_err = "warn"
+undocumented_unsafe_blocks = "warn"
+unnecessary_self_imports = "warn"
+unneeded_field_pattern = "warn"
+unwrap_in_result = "warn"
+unwrap_used = "warn"
+use_debug = "warn"
+verbose_file_reads = "warn"
+wildcard_dependencies = "warn"
+wildcard_enum_match_arm = "warn"
diff --git a/clippy.toml b/clippy.toml
new file mode 100644
index 0000000..cda8d17
--- /dev/null
+++ b/clippy.toml
@@ -0,0 +1 @@
+avoid-breaking-exported-api = false
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..6fbf7dc
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+//! # VUW Course scraper
+//!
+//! This is a simple program capable of parsing VUW's courses from the registry. It cannot correctly
+//! parse prerequisites, however.
+
+use std::{collections::HashSet, fmt};
+
+use scraper::ElementRef;
+use serde::{Deserialize, Serialize};
+use tracing::{debug, info};
+
+/// Slice used for splitting requirements for parsing.
+const SPLIT_SLICE: &[char] = &[';', ','];
+
+/// A VUW course, along with all relevant data.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+#[non_exhaustive]
+pub struct Course<'a> {
+    /// Description of the course.
+    pub description: Option<&'a str>,
+
+    /// Whether this course is offered in the upcoming year.
+    pub offered: bool,
+
+    /// Number of points this course is worth.
+    pub points: u8,
+
+    /// Courses that must be taken before this course.
+    pub prerequisites: Vec<&'a str>,
+
+    /// Courses that cannot be taken if you take this course.
+    pub restrictions: Vec<&'a str>,
+
+    /// Subject areas this course belongs to.
+    pub subject_areas: HashSet<&'a str>,
+
+    /// Subtitle of this course, its longer name.
+    pub subtitle: &'a str,
+
+    /// Timetable of this course; includes trimesters and CRNs.
+    pub timetable: Vec<CourseOffering>,
+
+    /// Title of this course, also known as the code.
+    pub title: &'a str,
+}
+
+impl<'a> Course<'a> {
+    /// Parses the course title and subtitle from the given element.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the subtitle text cannot be sliced at the expected index.
+    pub fn parse_courseid(&mut self, elem: ElementRef<'a>) {
+        elem.children().for_each(|child| {
+            child.children().for_each(|c| {
+                if let Some(text) = c.value().as_text() {
+                    // The actual text we're looking for
+                    let text: &str = text.trim();
+
+                    self.title = text;
+                } else if let Some(text) = c
+                    .first_child()
+                    .and_then(|node| node.value().as_text().map(|text| &**text))
+                {
+                    if let Some((indice, _char)) = text.char_indices().nth(1) {
+                        // Skip over "-"
+                        self.subtitle = text.get(indice..).expect("indice should be valid").trim();
+                    }
+                }
+            });
+        });
+    }
+
+    /// Parses the course points, prerequisites, and restrictions from the given element.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the points or requirements text is not in the expected format.
+    pub fn parse_coursepoints(&mut self, elem: ElementRef<'a>) {
+        // Parse course points, prerequisites, and exclusions.
+        let details = elem
+            .first_child()
+            .and_then(|el| el.first_child()?.value().as_text());
+
+        if let Some(details) = details {
+            let details_split: Vec<&str> = details.split(" \u{2022} ").take(2).collect();
+
+            info!("{:#?}", &details_split);
+
+            // Occasionally there is extra whitespace here, so this needs to be trimmed.
+            let points = details_split.first().expect("split should exist").trim();
+            debug!("{:?}", points);
+
+            let points_slice = &points.get(..points.len() - 4).expect("should be at indice");
+            info!("{:?}", points_slice);
+
+            let points = points_slice
+                .parse::<u8>()
+                .expect("should correctly parse points");
+            info!("{:?}", points);
+
+            self.points = points;
+
+            if let Some(requirements) = details_split.last().map(|s| s.trim()) {
+                if requirements.starts_with("(X)") {
+                    self.restrictions = requirements
+                        .get(4..)
+                        .expect("should be at indice")
+                        .split(SPLIT_SLICE)
+                        .map(str::trim)
+                        .collect::<Vec<&str>>();
+                } else if requirements.starts_with("(P)") {
+                    let requirements = &requirements
+                        .get(4..)
+                        .expect("should be at indice")
+                        .split(" (X) ")
+                        .collect::<Vec<&str>>();
+
+                    self.prerequisites = requirements
+                        .first()
+                        .map(|s| {
+                            s.split(SPLIT_SLICE)
+                                .map(str::trim)
+                                .filter(|s| !s.is_empty())
+                                .collect::<Vec<&str>>()
+                        })
+                        .unwrap_or_default();
+
+                    if requirements.len() > 1 {
+                        self.restrictions = requirements
+                            .last()
+                            .map(|s| s.split(SPLIT_SLICE).map(str::trim).collect::<Vec<&str>>())
+                            .unwrap_or_default();
+                    }
+                } else {
+                    self.prerequisites = vec![requirements];
+                }
+
+                info!("{requirements}");
+            }
+        }
+    }
+
+    /// Parses the course timetable (trimester and CRN) from the given element.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the CRN or trimester cannot be parsed from the element's text.
+    pub fn parse_timetable(&mut self, elem: ElementRef<'a>) {
+        // Parse timetable / CRNs.
+        let details = elem
+            .first_child()
+            .and_then(|el| el.first_child()?.value().as_text());
+
+        if let Some(details) = details {
+            let details_split: Vec<&str> = details.split(" \u{2022} ").take(2).collect();
+
+            info!("{:#?}", &details_split);
+
+            let offering = CourseOffering::new(
+                details_split
+                    .last()
+                    .expect("course reference number should exist")
+                    .get(4..)
+                    .expect("course reference number digits should start at this indice")
+                    .parse::<u16>()
+                    .expect("course reference number should be parseable"),
+                Trimester::try_from(
+                    *details_split
+                        .first()
+                        .expect("trimester element should exist"),
+                )
+                .expect("should be parseable into a trimester"),
+            );
+
+            self.timetable.push(offering);
+        }
+    }
+}
+
+impl Default for Course<'_> {
+    fn default() -> Self {
+        Self {
+            description: Option::default(),
+            offered: true,
+            points: u8::default(),
+            prerequisites: Vec::default(),
+            restrictions: Vec::default(),
+            subject_areas: HashSet::default(),
+            subtitle: "",
+            timetable: Vec::default(),
+            title: "",
+        }
+    }
+}
+
+/// A course offering; includes the CRN and [`Trimester`].
+#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
+#[non_exhaustive]
+pub struct CourseOffering {
+    /// Reference number for this course, e.g. 11723.
+    pub course_reference_number: u16,
+
+    /// Trimester this course is offered in.
+    pub trimester: Trimester,
+}
+
+impl CourseOffering {
+    /// Creates a new [`CourseOffering`].
+    #[must_use]
+    pub const fn new(course_reference_number: u16, trimester: Trimester) -> Self {
+        Self {
+            course_reference_number,
+            trimester,
+        }
+    }
+}
+
+/// Trimesters in which Victoria University of Wellington offers courses.
+#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, PartialOrd, Ord, Serialize)]
+pub enum Trimester {
+    /// Trimester one.
+    One,
+
+    /// Trimester two.
+    Two,
+
+    /// Trimester three.
+    Three,
+
+    /// Block dates. See course page for more information.
+    BlockDates,
+
+    /// Part year. See course page for more information.
+    PartYear,
+
+    /// Trimesters one and two.
+    OneTwo,
+
+    /// Trimesters two and three.
+    TwoThree,
+
+    /// Trimesters three and one.
+    ThreeOne,
+
+    /// Trimesters one, two, and three.
+    FullYear,
+}
+
+impl TryFrom<&str> for Trimester {
+    type Error = String;
+
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        match value {
+            "1/3" => Ok(Self::One),
+            "2/3" => Ok(Self::Two),
+            "3/3" => Ok(Self::Three),
+            "block dates/3" => Ok(Self::BlockDates),
+            "part year/3" => Ok(Self::PartYear),
+            "1+2/3" => Ok(Self::OneTwo),
+            "2+3/3" => Ok(Self::TwoThree),
+            "3+1/3" => Ok(Self::ThreeOne),
+            "1+2+3/3" => Ok(Self::FullYear),
+            _ => Err(String::from("Invalid trimester.")),
+        }
+    }
+}
+
+impl fmt::Display for Course<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Course {{ title: {}, subtitle: {}, offered: {}, areas: [{}] }}",
+            self.title,
+            self.subtitle,
+            self.offered,
+            self.subject_areas
+                .iter()
+                // Necessary as `iter()` yields `&&str`, not `&str`.
+                .map(|s| &**s)
+                .collect::<Vec<&str>>()
+                .join(", "),
+        )
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index ee7686e..05f3976 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,270 +1,85 @@
 // SPDX-License-Identifier: AGPL-3.0-or-later

-use serde::{Deserialize, Serialize};
-use std::{
-    collections::{HashMap, HashSet},
-    fmt, fs,
-};
+//! # VUW Course scraper
+//!
+//! This is a simple program capable of parsing VUW's courses from the registry. It cannot correctly
+//! parse prerequisites, however.

-use tracing::{debug, info, level_filters::LevelFilter};
+use std::{collections::HashMap, fs};

 use scraper::{CaseSensitivity, Html, Selector};
+use serde::{Deserialize, Serialize};
+use tracing::level_filters::LevelFilter;
+use vuw_course_scraper::Course;

-// TODO: Use string slices to avoid clones?
-#[derive(Clone, Debug, Deserialize, Serialize)]
-struct Course<'a> {
-    description: Option<&'a str>,
-    offered: bool,
-    points: u8,
-    prerequisites: Vec<String>,
-    restrictions: Vec<String>,
-    subject_areas: HashSet<String>,
-    subtitle: &'a str,
-    timetable: Vec<CourseOffering>,
-    title: &'a str,
-}
-
-#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
-struct CourseOffering {
-    course_reference_number: u16,
-    trimester: Trimester,
-}
-
-#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, PartialOrd, Ord, Serialize)]
-enum Trimester {
-    One,
-    Two,
-    Three,
-    BlockDates,
-    PartYear,
-    OneTwo,
-    TwoThree,
-    ThreeOne,
-    FullYear,
-}
-
-impl TryFrom<&str> for Trimester {
-    type Error = String;
-
-    fn try_from(value: &str) -> Result<Self, Self::Error> {
-        match value {
-            "1/3" => Ok(Self::One),
-            "2/3" => Ok(Self::Two),
-            "3/3" => Ok(Self::Three),
-            "block dates/3" => Ok(Self::BlockDates),
-            "part year/3" => Ok(Self::PartYear),
-            "1+2/3" => Ok(Self::OneTwo),
-            "2+3/3" => Ok(Self::TwoThree),
-            "3+1/3" => Ok(Self::ThreeOne),
-            "1+2+3/3" => Ok(Self::FullYear),
-            _ => Err(String::from("Invalid trimester.")),
-        }
-    }
-}
-
+/// Utility struct for exporting to JSON.
 #[derive(Clone, Deserialize, Serialize)]
 struct JsonExport<'a> {
-    #[serde(borrow)]
-    courses: HashMap<&'a str, Course<'a>>,
-}
-
-impl Default for Course<'_> {
-    fn default() -> Self {
-        Self {
-            description: Option::default(),
-            offered: true,
-            points: u8::default(),
-            prerequisites: Vec::default(),
-            restrictions: Vec::default(),
-            subject_areas: HashSet::default(),
-            subtitle: "",
-            timetable: Vec::default(),
-            title: "",
-        }
-    }
-}
-
-impl fmt::Display for Course<'_> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "Course {{ title: {}, subtitle: {}, offered: {}, areas: [{}] }}",
-            self.title,
-            self.subtitle,
-            self.offered,
-            self.subject_areas
-                .iter()
-                // Necessary as Rust refuses to build Vec<&str> with &String.
-                .map(|s| &s[..])
-                .collect::<Vec<&str>>()
-                .join(", "),
-        )
-    }
+    /// [`HashMap`] of all courses.
+    #[serde(borrow)]
+    courses: HashMap<&'a str, Course<'a>>,
 }

 fn main() {
-    tracing_subscriber::fmt()
-        .with_max_level(LevelFilter::INFO)
-        .init();
-
-    let html = include_str!("../courses.html");
-
-    let document = Html::parse_document(html);
-
-    let mut course_map: HashMap<&str, Course> = HashMap::new();
-
-    let mut subject_area = String::new();
-    let mut working_course = Course::default();
-
-    for elem in document.select(&Selector::parse("p").unwrap()) {
-        let elem_value = elem.value();
-
-        if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) {
-            course_map
-                .entry(working_course.title)
-                .and_modify(|c| {
-                    c.subject_areas.insert(subject_area.clone());
-                })
-                .or_insert(working_course);
-            working_course = Course::default();
-            working_course.subject_areas.insert(subject_area.clone());
-
-            elem.children().for_each(|child| {
-                child.children().for_each(|c| {
-                    if c.value().is_text() {
-                        let working = &c.value().as_text().unwrap()[..];
-
-                        // Skip over space.
-                        working_course.title = &working[..working.len() - 1];
-                    } else {
-                        working_course.subtitle = &c
-                            .first_child()
-                            .unwrap()
-                            .value()
-                            .as_text()
-                            .unwrap()
-                            // Skip over "- ".
-                            [4..];
-                    }
-                });
-            });
-        } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) {
-            working_course.offered = false;
-        } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) {
-            subject_area = elem
-                .first_child()
-                .unwrap()
-                .first_child()
-                .unwrap()
-                .value()
-                .as_text()
-                .unwrap()
-                .to_string();
-        } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) {
-            let description = elem
-                .first_child()
-                .and_then(|el| el.first_child()?.value().as_text())
-                .map(|t| &t[..]);
-
-            working_course.description = description;
-
-            // println!("{}", working_course.description);
-        } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) {
-            // Parse timetable / CRNs.
-            let details = elem
-                .first_child()
-                .and_then(|el| el.first_child()?.value().as_text())
-                .map(|t| t.to_string());
-
-            if let Some(details) = details {
-                let details_split: Vec<&str> = details.split(" • ").take(2).collect();
-
-                info!("{:#?}", &details_split);
-
-                let offering = CourseOffering {
-                    course_reference_number: details_split.last().unwrap()[4..]
-                        .parse::<u16>()
-                        .unwrap(),
-                    trimester: Trimester::try_from(*details_split.first().unwrap()).unwrap(),
-                };
-
-                working_course.timetable.push(offering);
-            }
-        } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) {
-            // Parse course points, prerequisites, and exclusions.
-            let details = elem
-                .first_child()
-                .and_then(|el| el.first_child()?.value().as_text())
-                .map(|t| t.to_string());
-
-            if let Some(details) = details {
-                let details_split: Vec<&str> = details.split(" • ").take(2).collect();
-
-                info!("{:#?}", &details_split);
-
-                // Occasionally there is extra whitespace here, so this needs to be trimmed.
-                let points = details_split.first().unwrap().trim();
-                debug!("{:?}", points);
-
-                let points_slice = &points[..points.len() - 4];
-                info!("{:?}", points_slice);
-
-                let points = points_slice.parse::<u8>().unwrap();
-                info!("{:?}", points);
-
-                working_course.points = points;
-
-                if let Some(requirements) = details_split.last().map(|s| s.trim()) {
-                    if requirements.starts_with("(X)") {
-                        working_course.restrictions = requirements[4..]
-                            .split(&[';', ','])
-                            .map(str::trim)
-                            .map(str::to_owned)
-                            .collect::<Vec<String>>();
-                    } else {
-                        let requirements = &requirements[4..].split(" (X) ").collect::<Vec<&str>>();
-
-                        working_course.prerequisites = requirements
-                            .first()
-                            .unwrap()
-                            .split(&[',', ';'])
-                            .map(str::trim)
-                            .filter(|s| !s.is_empty())
-                            .map(str::to_owned)
-                            .collect();
-
-                        if requirements.len() > 1 {
-                            working_course.restrictions = requirements
-                                .last()
-                                .unwrap()
-                                .split(&[',', ';'])
-                                .map(str::trim)
-                                .map(str::to_owned)
-                                .collect();
-                        }
-                    }
-
-                    info!("{requirements}");
-                }
-            }
-        }
-    }
-
-    debug!("{:?}", course_map.get("COMP 102"));
-
-    course_map.remove("");
-
-    fs::write(
-        "./export.json",
-        simd_json::serde::to_string(&JsonExport {
-            courses: course_map,
-        })
-        .unwrap(),
-    )
-    .unwrap();
-
-    // course_map
-    //     .values()
-    //     .for_each(|c| println!("{:#?}", c.subject_areas));
-    // course_map.values().for_each(|c| println!("{c}"));
+    tracing_subscriber::fmt()
+        .with_max_level(LevelFilter::INFO)
+        .init();
+
+    let html = &fs::read_to_string("./courses.html").expect("file does not exist");
+
+    let document = Html::parse_document(html);
+
+    let mut course_map: HashMap<&str, Course> = HashMap::new();
+
+    let mut subject_area = "";
+    let mut working_course = Course::default();
+
+    for elem in document.select(&Selector::parse("p").expect("selector should always be valid")) {
+        let elem_value = elem.value();
+
+        if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) {
+            course_map
+                .entry(working_course.title)
+                .and_modify(|c| {
+                    c.subject_areas.insert(subject_area);
+                })
+                .or_insert(working_course);
+            working_course = Course::default();
+            working_course.subject_areas.insert(subject_area);
+
+            working_course.parse_courseid(elem);
+        } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) {
+            working_course.offered = false;
+        } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) {
+            if let Some(subject_area_name) = elem.first_child().and_then(|child| {
+                child
+                    .first_child()
+                    .and_then(|nested_child| nested_child.value().as_text())
+            }) {
+                subject_area = &**subject_area_name;
+            }
+        } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) {
+            let description = elem
+                .first_child()
+                .and_then(|el| el.first_child()?.value().as_text())
+                .map(|t| &**t);
+
+            working_course.description = description;
+        } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) {
+            working_course.parse_timetable(elem);
+        } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) {
+            working_course.parse_coursepoints(elem);
+        }
+    }
+
+    course_map.remove("");
+
+    fs::write(
+        "./export.json",
+        simd_json::serde::to_string(&JsonExport {
+            courses: course_map,
+        })
+        .expect("json should parse correctly"),
+    )
+    .expect("file should be writable");
 }
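
For a quick check of the trimester notation handled by the new `TryFrom<&str> for Trimester` in `src/lib.rs`, a small consumer of the library might look like this (a sketch only; it assumes the crate is available under the name `vuw_course_scraper`, which `src/main.rs` imports it as):

```rust
use vuw_course_scraper::Trimester;

fn main() {
    // "1+2/3" is the registry's notation for a course spanning trimesters one and two.
    assert_eq!(Trimester::try_from("1+2/3"), Ok(Trimester::OneTwo));

    // Anything outside the known notations is rejected with an error string.
    assert!(Trimester::try_from("summer school").is_err());
}
```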