feat(parser): finish nom rewrite with coursepoints parser

This parser can correctly parse course prerequisites, corequisites, and restrictions, which the previous parser could not do. These cannot be split into a truly computer readable format yet, and I believe this would be out of scope for this project.
author: Sophie Forrest <git@sophieforrest.com> 2024-12-04 17:53:46 +1300
committer: Sophie Forrest <git@sophieforrest.com> 2024-12-04 17:53:46 +1300
commit: 3725fe07e58f459bb7ab9fcbc10775cf4b138ec8 (patch)
tree: e3c07e8bf7ba53a164538973787deb3e6693ea3e /src
parent: f20503aa26ec2e91fb585defa338993985dac2e5 (diff)
2 files changed, 122 insertions, 63 deletions
diff --git a/src/lib.rs b/src/lib.rs
index 25f6ea6..ee08cbc 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,13 +9,10 @@ mod parser;
 
 use std::collections::{HashMap, HashSet};
 
-use parser::{offering, subtitle, title};
+use parser::{course_offering, offering, subtitle, title};
 use scraper::{CaseSensitivity, ElementRef, Html, Selector};
 use serde::{Deserialize, Serialize};
-use tracing::{debug, info};
-
-/// Slice used for splitting requirements for parsing.
-const SPLIT_SLICE: &[char] = &[';', ','];
+use tracing::info;
 
 /// Alias to the nom error type.
 type NomError<'a> = nom::Err<nom::error::Error<&'a str>>;
@@ -78,74 +75,38 @@ impl<'a> Course<'a> {
 
 	/// Parses the course points, prerequisites, and restrictions from the given element.
 	///
-	/// # Panics
+	/// # Errors
 	///
-	/// Panics if parsing fails, or a slice is made in the middle of a character.
-	pub fn parse_coursepoints(&mut self, elem: ElementRef<'a>) {
+	/// Returns an error if the nom parser fails to parse the coursepoints text.
+	pub fn parse_coursepoints(&mut self, elem: ElementRef<'a>) -> Result<(), NomError> {
 		// Parse course points, prerequisites, and exclusions.
 		let details = elem
 			.first_child()
 			.and_then(|el| el.first_child()?.value().as_text());
 
 		if let Some(details) = details {
-			let details_split: Vec<&str> = details.split(" \u{2022} ").take(2).collect();
-
-			info!("{:#?}", &details_split);
-
-			// Occasionally there is extra whitespace here, so this needs to be trimmed.
-			let points = details_split.first().expect("split should exist").trim();
-			debug!("{:?}", points);
-
-			let points_slice = &points.get(..points.len() - 4).expect("should be at indice");
-			info!("{:?}", points_slice);
-
-			let points = points_slice
-				.parse::<f32>()
-				.expect("should correctly parse points");
-			info!("{:?}", points);
+			// Parse the info from our nom parser.
+			let (_, (points, requirements)) = course_offering(details)?;
 
 			self.points = points;
 
-			if let Some(requirements) = details_split.last().map(|s| s.trim()) {
-				if requirements.starts_with("(X)") {
-					self.restrictions = requirements
-						.get(4..)
-						.expect("should be at indice")
-						.split(SPLIT_SLICE)
-						.map(str::trim)
-						.collect::<Vec<&str>>();
-				} else if requirements.starts_with("(P)") {
-					let requirements = &requirements
-						.get(4..)
-						.expect("should be at indice")
-						.split(" (X) ")
-						.collect::<Vec<&str>>();
-
-					self.prerequisites = requirements
-						.first()
-						.map(|s| {
-							s.split(SPLIT_SLICE)
-								.map(str::trim)
-								.filter(|s| !s.is_empty())
-								.collect::<Vec<&str>>()
-						})
-						.unwrap_or_default();
-
-					if requirements.len() > 1 {
-						self.restrictions = requirements
-							.last()
-							.map(|s| s.split(SPLIT_SLICE).map(str::trim).collect::<Vec<&str>>())
-							.unwrap_or_default();
-					}
-				} else if details_split.len() > 1 {
-					// Prevent the points from being dumped into requirements if they're the only
-					// item.
-					self.prerequisites = vec![requirements];
+			if let Some((prerequisites, corequisites, restrictions)) = requirements {
+				// None of these are guaranteed to exist, so we need to use let Some for these.
+				if let Some(prerequisites) = prerequisites {
+					self.prerequisites.push(prerequisites);
 				}
 
-				info!("{requirements}");
+				if let Some(corequisites) = corequisites {
+					self.corequisites.push(corequisites);
+				}
+
+				if let Some(restrictions) = restrictions {
+					self.restrictions.push(restrictions);
+				}
 			}
 		}
+
+		Ok(())
 	}
 
 	/// Parses the course timetable.
@@ -310,7 +271,9 @@ pub fn parse_document(document: &Html) -> HashMap<&str, Course<'_>> {
 				.parse_timetable(elem)
 				.expect("could not parse timetable");
 		} else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) {
-			working_course.parse_coursepoints(elem);
+			working_course
+				.parse_coursepoints(elem)
+				.expect("could not parse coursepoints");
 		}
 	}
 
diff --git a/src/parser.rs b/src/parser.rs
index 87da6be..0f3ef7b 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -4,10 +4,11 @@
 
 use nom::{
 	branch::alt,
-	bytes::complete::{tag, take, take_till, take_while},
+	bytes::complete::{tag, take, take_till, take_until, take_while},
 	character::complete::{char, multispace0},
-	combinator::{map_res, rest},
-	sequence::{delimited, pair, preceded, separated_pair},
+	combinator::{map, map_res, opt, rest},
+	number::complete::float,
+	sequence::{delimited, pair, preceded, separated_pair, tuple},
 	IResult,
 };
 
@@ -101,6 +102,84 @@ pub fn subtitle(input: &str) -> IResult<&str, &str> {
 	preceded(tag("\u{2013} "), rest)(input)
 }
 
+/// Parses course prerequisites from an input.
+///
+/// # Errors
+///
+/// This function will return an error if the input does not start with `(P)`.
+pub fn prerequisites(input: &str) -> IResult<&str, &str> {
+	map(
+		preceded(
+			tag("(P)"),
+			alt((take_until("(C)"), take_until("(X)"), rest)),
+		),
+		// The data will often end up with leading and trailing spaces. Trimming is the easiest
+		// way to get rid of these.
+		str::trim,
+	)(input)
+}
+
+/// Parses course corequisites from an input.
+///
+/// # Errors
+///
+/// This function will return an error if the input does not start with `(C)`.
+pub fn corequisites(input: &str) -> IResult<&str, &str> {
+	map(
+		preceded(tag("(C)"), alt((take_until("(X)"), rest))),
+		// The data will often end up with leading and trailing spaces. Trimming is the easiest
+		// way to get rid of these.
+		str::trim,
+	)(input)
+}
+
+/// Parses course restrictions from an input.
+///
+/// # Errors
+///
+/// This function will return an error if the input does not start with `(X)`.
+pub fn restrictions(input: &str) -> IResult<&str, &str> {
+	map(
+		preceded(tag("(X)"), rest),
+		// The data will often end up with leading and trailing spaces. Trimming is the easiest
+		// way to get rid of these.
+		str::trim,
+	)(input)
+}
+
+/// Alias for the return type of the requirements parser.
+type RequirementsReturn<'a> = (Option<&'a str>, Option<&'a str>, Option<&'a str>);
+
+/// Parses course requirements from an input.
+///
+/// # Errors
+///
+/// This function should not return an error, and errors are to be considered unreachable.
+pub fn requirements(input: &str) -> IResult<&str, RequirementsReturn> {
+	tuple((opt(prerequisites), opt(corequisites), opt(restrictions)))(input)
+}
+
+/// Parses the course points from an input.
+///
+/// # Errors
+///
+/// This function will return an error if the input provided does not begin with a float.
+pub fn course_points(input: &str) -> IResult<&str, f32> {
+	float(input)
+}
+
+/// Parses the entire "coursepoints" section of the course offering.
+///
+/// # Errors
+///
+/// This function will return an error if the course points cannot be parsed.
+pub fn course_offering(input: &str) -> IResult<&str, (f32, Option<RequirementsReturn>)> {
+	tuple((
+		course_points,
+		opt(preceded(tag(" pts \u{2022} "), requirements)),
+	))(input)
+}
+
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
@@ -168,4 +247,21 @@ mod tests {
 			"Identification, Assessment and Control of Hazards and Risks"
 		);
 	}
+
+	#[test]
+	fn prereq_parser() {
+		let parsed_prereq = prerequisites("(P) LING 123 (C) SOPH 184 (X) SOPH 185").unwrap();
+
+		assert_eq!(parsed_prereq.1, "LING 123");
+	}
+
+	#[test]
+	fn req_parser() {
+		let parsed_prereq = requirements("(P) LING 229, LING 228; (C) MATH 883").unwrap();
+
+		assert_eq!(
+			parsed_prereq.1,
+			(Some("LING 229, LING 228;"), Some("MATH 883"), None)
+		);
+	}
 }
author	Sophie Forrest <git@sophieforrest.com>	2024-12-04 17:53:46 +1300
committer	Sophie Forrest <git@sophieforrest.com>	2024-12-04 17:53:46 +1300
commit	3725fe07e58f459bb7ab9fcbc10775cf4b138ec8 (patch)
tree	e3c07e8bf7ba53a164538973787deb3e6693ea3e /src
parent	f20503aa26ec2e91fb585defa338993985dac2e5 (diff)