refactor: move parsing into separate function

Preparation for adding download capabilities.
author: Sophie Forrest <git@sophieforrest.com> 2024-09-10 08:53:41 +1200
committer: Sophie Forrest <git@sophieforrest.com> 2024-09-10 08:53:41 +1200
commit: 08d060660f2f6bd678770026e109e0cd7429e6f1 (patch)
tree: 0769a5b614be7c1b5a1a3c44da7c00999c759f63 /src/lib.rs
parent: d0256057ac4e2d18f30a7cd5845d315f3167ac67 (diff)
1 files changed, 57 insertions, 17 deletions
diff --git a/src/lib.rs b/src/lib.rs
index 38af605..3dc8e99 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,9 +5,12 @@
 //! This is a simple program capable of parsing VUWs courses from the registry. It cannot correctly
 //! parse prerequisites, however.
 
-use std::{collections::HashSet, fmt};
+use std::{
+	collections::{HashMap, HashSet},
+	fmt,
+};
 
-use scraper::ElementRef;
+use scraper::{CaseSensitivity, ElementRef, Html, Selector};
 use serde::{Deserialize, Serialize};
 use tracing::{debug, info};
 
@@ -269,20 +272,57 @@ impl TryFrom<&str> for Trimester {
 	}
 }
 
-impl fmt::Display for Course<'_> {
-	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-		write!(
-			f,
-			"Course {{ title: {}, subtitle: {}, offered: {}, areas: [{}] }}",
-			self.title,
-			self.subtitle,
-			self.offered,
-			self.subject_areas
-                .iter()
-                // Necessary as Rust refuses to build Vec<&str> with &String.
-                .map(|s| &**s)
-                .collect::<Vec<&str>>()
-                .join(", "),
-		)
+/// Stores a finished course in `course_map`, keyed by its title.
+///
+/// If a course with the same title is already present, the subject areas recorded on `course`
+/// are merged into the existing entry and the rest of `course` is discarded. Otherwise `course`
+/// is inserted unchanged.
+fn store_course<'a>(course_map: &mut HashMap<&'a str, Course<'a>>, course: Course<'a>) {
+	course_map
+		.entry(course.title)
+		.and_modify(|existing| {
+			// Merge the areas recorded on the course itself rather than the parser's current
+			// subject area: by the time a course is stored, a `subjectarea` heading for the
+			// *next* group of courses may already have been seen.
+			existing
+				.subject_areas
+				.extend(course.subject_areas.iter().copied());
+		})
+		.or_insert(course);
+}
+
+/// Parses a [`Html`] document into a [`HashMap`] of courses.
+///
+/// Every `<p>` element is visited in document order and dispatched on its class:
+///
+/// - `courseid` stores the course built so far and starts a new one, tagged with the current
+///   subject area.
+/// - `notoffered` marks the current course as not offered.
+/// - `subjectarea` updates the subject area applied to courses that follow.
+/// - `subjectsbody` sets the current course's description.
+/// - `timetable` and `coursepoints` are handed to the matching `Course` parsing methods.
+///
+/// The returned map is keyed by course title. A title that appears more than once yields a
+/// single entry whose subject areas are the union of all occurrences. Entries with an empty
+/// title are dropped.
+///
+/// # Panics
+///
+/// Panics if [`Selector`] fails to parse.
+#[must_use]
+pub fn parse_document(document: &Html) -> HashMap<&str, Course<'_>> {
+	let paragraphs = Selector::parse("p").expect("selector should always be valid");
+
+	let mut course_map: HashMap<&str, Course> = HashMap::new();
+
+	let mut subject_area = "";
+	let mut working_course = Course::default();
+
+	for elem in document.select(&paragraphs) {
+		let elem_value = elem.value();
+
+		if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) {
+			// A new course starts here: store the previous one and begin from a fresh default.
+			store_course(&mut course_map, std::mem::take(&mut working_course));
+			working_course.subject_areas.insert(subject_area);
+
+			working_course.parse_courseid(elem);
+		} else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) {
+			working_course.offered = false;
+		} else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) {
+			// The area name is read from the text node nested inside the first child element.
+			if let Some(subject_area_name) = elem.first_child().and_then(|child| {
+				child
+					.first_child()
+					.and_then(|nested_child| nested_child.value().as_text())
+			}) {
+				subject_area = &**subject_area_name;
+			}
+		} else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) {
+			let description = elem
+				.first_child()
+				.and_then(|el| el.first_child()?.value().as_text())
+				.map(|t| &**t);
+
+			working_course.description = description;
+		} else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) {
+			working_course.parse_timetable(elem);
+		} else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) {
+			working_course.parse_coursepoints(elem);
+		}
 	}
+
+	// Courses are only stored when the *next* `courseid` paragraph is reached, so the final
+	// course in the document has to be stored explicitly or it would be lost.
+	store_course(&mut course_map, working_course);
+
+	// Drop any entry with an empty title, e.g. the placeholder course that exists before the
+	// first `courseid` paragraph (presumably `Course::default()` has an empty title).
+	course_map.remove("");
+
+	course_map
 }
author	Sophie Forrest <git@sophieforrest.com>	2024-09-10 08:53:41 +1200
committer	Sophie Forrest <git@sophieforrest.com>	2024-09-10 08:53:41 +1200
commit	08d060660f2f6bd678770026e109e0cd7429e6f1 (patch)
tree	0769a5b614be7c1b5a1a3c44da7c00999c759f63 /src/lib.rs
parent	d0256057ac4e2d18f30a7cd5845d315f3167ac67 (diff)