diff options
| author | Sophie Forrest <git@sophieforrest.com> | 2024-12-04 17:53:46 +1300 |
|---|---|---|
| committer | Sophie Forrest <git@sophieforrest.com> | 2024-12-04 17:53:46 +1300 |
| commit | 3725fe07e58f459bb7ab9fcbc10775cf4b138ec8 (patch) | |
| tree | e3c07e8bf7ba53a164538973787deb3e6693ea3e /src | |
| parent | f20503aa26ec2e91fb585defa338993985dac2e5 (diff) | |
feat(parser): finish nom rewrite with coursepoints parser
This parser can correctly parse course prerequisites, corequisites, and restrictions, which the previous parser could not do. These cannot be split into a truly computer-readable format yet, and I believe this would be out of scope for this project.
Diffstat (limited to '')
| -rw-r--r-- | src/lib.rs | 83 | ||||
| -rw-r--r-- | src/parser.rs | 102 |
2 files changed, 122 insertions, 63 deletions
diff --git a/src/lib.rs b/src/lib.rs index 25f6ea6..ee08cbc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,13 +9,10 @@ mod parser; use std::collections::{HashMap, HashSet}; -use parser::{offering, subtitle, title}; +use parser::{course_offering, offering, subtitle, title}; use scraper::{CaseSensitivity, ElementRef, Html, Selector}; use serde::{Deserialize, Serialize}; -use tracing::{debug, info}; - -/// Slice used for splitting requirements for parsing. -const SPLIT_SLICE: &[char] = &[';', ',']; +use tracing::info; /// Alias to the nom error type. type NomError<'a> = nom::Err<nom::error::Error<&'a str>>; @@ -78,74 +75,38 @@ impl<'a> Course<'a> { /// Parses the course points, prerequisites, and restrictions from the given element. /// - /// # Panics + /// # Errors /// - /// Panics if parsing fails, or a slice is made in the middle of a character. - pub fn parse_coursepoints(&mut self, elem: ElementRef<'a>) { + /// Panics if parsing fails. + pub fn parse_coursepoints(&mut self, elem: ElementRef<'a>) -> Result<(), NomError> { // Parse course points, prerequisites, and exclusions. let details = elem .first_child() .and_then(|el| el.first_child()?.value().as_text()); if let Some(details) = details { - let details_split: Vec<&str> = details.split(" \u{2022} ").take(2).collect(); - - info!("{:#?}", &details_split); - - // Occasionally there is extra whitespace here, so this needs to be trimmed. - let points = details_split.first().expect("split should exist").trim(); - debug!("{:?}", points); - - let points_slice = &points.get(..points.len() - 4).expect("should be at indice"); - info!("{:?}", points_slice); - - let points = points_slice - .parse::<f32>() - .expect("should correctly parse points"); - info!("{:?}", points); + // Parse the info from our nom parser. 
+ let (_, (points, requirements)) = course_offering(details)?; self.points = points; - if let Some(requirements) = details_split.last().map(|s| s.trim()) { - if requirements.starts_with("(X)") { - self.restrictions = requirements - .get(4..) - .expect("should be at indice") - .split(SPLIT_SLICE) - .map(str::trim) - .collect::<Vec<&str>>(); - } else if requirements.starts_with("(P)") { - let requirements = &requirements - .get(4..) - .expect("should be at indice") - .split(" (X) ") - .collect::<Vec<&str>>(); - - self.prerequisites = requirements - .first() - .map(|s| { - s.split(SPLIT_SLICE) - .map(str::trim) - .filter(|s| !s.is_empty()) - .collect::<Vec<&str>>() - }) - .unwrap_or_default(); - - if requirements.len() > 1 { - self.restrictions = requirements - .last() - .map(|s| s.split(SPLIT_SLICE).map(str::trim).collect::<Vec<&str>>()) - .unwrap_or_default(); - } - } else if details_split.len() > 1 { - // Prevent the points from being dumped into requirements if they're the only - // item. - self.prerequisites = vec![requirements]; + if let Some((prerequisites, corequisites, restrictions)) = requirements { + // None of these are guaranteed to exist, so we need to use let Some for these. + if let Some(prerequisites) = prerequisites { + self.prerequisites.push(prerequisites); } - info!("{requirements}"); + if let Some(corequisites) = corequisites { + self.corequisites.push(corequisites); + } + + if let Some(restrictions) = restrictions { + self.restrictions.push(restrictions); + } } } + + Ok(()) } /// Parses the course timetable. 
@@ -310,7 +271,9 @@ pub fn parse_document(document: &Html) -> HashMap<&str, Course<'_>> { .parse_timetable(elem) .expect("could not parse timetable"); } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) { - working_course.parse_coursepoints(elem); + working_course + .parse_coursepoints(elem) + .expect("could not parse coursepoints"); } } diff --git a/src/parser.rs b/src/parser.rs index 87da6be..0f3ef7b 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -4,10 +4,11 @@ use nom::{ branch::alt, - bytes::complete::{tag, take, take_till, take_while}, + bytes::complete::{tag, take, take_till, take_until, take_while}, character::complete::{char, multispace0}, - combinator::{map_res, rest}, - sequence::{delimited, pair, preceded, separated_pair}, + combinator::{map, map_res, opt, rest}, + number::complete::float, + sequence::{delimited, pair, preceded, separated_pair, tuple}, IResult, }; @@ -101,6 +102,84 @@ pub fn subtitle(input: &str) -> IResult<&str, &str> { preceded(tag("\u{2013} "), rest)(input) } +/// Parses course prerequisites from an input. +/// +/// # Errors +/// +/// This function will return an error if the input is not preceded by (P). +pub fn prerequisites(input: &str) -> IResult<&str, &str> { + map( + preceded( + tag("(P)"), + alt((take_until("(C)"), take_until("(X)"), rest)), + ), + // The data will often end up with leading and trailing spaces. Trimming is the easiest + // way to get rid of these. + str::trim, + )(input) +} + +/// Parses course corequisites from an input. +/// +/// # Errors +/// +/// This function will return an error if the input is not preceded by (C). +pub fn corequisites(input: &str) -> IResult<&str, &str> { + map( + preceded(tag("(C)"), alt((take_until("(X)"), rest))), + // The data will often end up with leading and trailing spaces. Trimming is the easiest + // way to get rid of these. + str::trim, + )(input) +} + +/// Parses course restrictions from an input. 
+/// +/// # Errors +/// +/// This function will return an error if the input is not preceded by (X). +pub fn restrictions(input: &str) -> IResult<&str, &str> { + map( + preceded(tag("(X)"), rest), + // The data will often end up with leading and trailing spaces. Trimming is the easiest + // way to get rid of these. + str::trim, + )(input) +} + +/// Alias for the return type of the requirements parser. +type RequirementsReturn<'a> = (Option<&'a str>, Option<&'a str>, Option<&'a str>); + +/// Parses course requirements from an input. +/// +/// # Errors +/// +/// This function should not return an error, and errors are to be considered unreachable. +pub fn requirements(input: &str) -> IResult<&str, RequirementsReturn> { + tuple((opt(prerequisites), opt(corequisites), opt(restrictions)))(input) +} + +/// Parses the course points from an input. +/// +/// # Errors +/// +/// This function will return an error if the input provided does not contain a float. +pub fn course_points(input: &str) -> IResult<&str, f32> { + float(input) +} + +/// Parses the entire "coursepoints" section of the course offering. +/// +/// # Errors +/// +/// This function will return an error if the course points cannot be parsed. +pub fn course_offering(input: &str) -> IResult<&str, (f32, Option<RequirementsReturn>)> { + tuple(( + course_points, + opt(preceded(tag(" pts \u{2022} "), requirements)), + ))(input) +} + #[cfg(test)] #[allow(clippy::unwrap_used)] mod tests { @@ -168,4 +247,21 @@ mod tests { "Identification, Assessment and Control of Hazards and Risks" ); } + + #[test] + fn prereq_parser() { + let parsed_prereq = prerequisites("(P) LING 123 (C) SOPH 184 (X) SOPH 185").unwrap(); + + assert_eq!(parsed_prereq.1, "LING 123"); + } + + #[test] + fn req_parser() { + let parsed_prereq = requirements("(P) LING 229, LING 228; (C) MATH 883").unwrap(); + + assert_eq!( + parsed_prereq.1, + (Some("LING 229, LING 228;"), Some("MATH 883"), None) + ); + } } |