summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/lib.rs 287
-rw-r--r-- src/main.rs 331
2 files changed, 360 insertions, 258 deletions
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..6fbf7dc
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: AGPL-3.0-or-later
+
+//! # VUW Course scraper
+//!
+//! This is a simple program capable of parsing VUW's courses from the registry. It cannot
+//! correctly parse prerequisites, however.
+
+use std::{collections::HashSet, fmt};
+
+use scraper::ElementRef;
+use serde::{Deserialize, Serialize};
+use tracing::{debug, info};
+
+/// Slice used for splitting requirements for parsing.
+const SPLIT_SLICE: &[char] = &[';', ','];
+
+/// A VUW course, along with all relevant data.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+#[non_exhaustive]
+pub struct Course<'a> {
+	/// Description of the course.
+	pub description: Option<&'a str>,
+
+	/// Whether this course is offered in the upcoming year.
+	pub offered: bool,
+
+	/// Amount of points this course is worth.
+	pub points: u8,
+
+	/// Courses that must be taken before this course.
+	pub prerequisites: Vec<&'a str>,
+
+	/// Courses that cannot be taken if you take this course.
+	pub restrictions: Vec<&'a str>,
+
+	/// Subject areas this course belongs to.
+	pub subject_areas: HashSet<&'a str>,
+
+	/// Subtitle of this course, its longer name.
+	pub subtitle: &'a str,
+
+	/// Timetable of this course, includes trimesters and CRNs.
+	pub timetable: Vec<CourseOffering>,
+
+	/// Title of this course, also known as the code.
+	pub title: &'a str,
+}
+
+impl<'a> Course<'a> {
+	/// Parses the course title (code) and subtitle from the given element.
+	///
+	/// # Panics
+	///
+	/// Panics if the computed subtitle offset is not a valid index into the text.
+	pub fn parse_courseid(&mut self, elem: ElementRef<'a>) {
+		elem.children().for_each(|child| {
+			child.children().for_each(|c| {
+				if let Some(text) = c.value().as_text() {
+					// The actual text we're looking for
+					let text: &str = text.trim();
+
+					self.title = text;
+				} else if let Some(text) = c
+					.first_child()
+					.and_then(|node| node.value().as_text().map(|text| &**text))
+				{
+					if let Some((indice, _char)) = text.char_indices().nth(1) {
+						// Skip over "-"
+						self.subtitle = text.get(indice..).expect("indice should be valid").trim();
+					}
+				}
+			});
+		});
+	}
+
+	/// Parses the course points, prerequisites, and restrictions from the given element.
+	///
+	/// # Panics
+	///
+	/// Panics if the points cannot be parsed, or if a `(P)`/`(X)` marker has nothing after it.
+	pub fn parse_coursepoints(&mut self, elem: ElementRef<'a>) {
+		// Parse course points, prerequisites, and exclusions.
+		let details = elem
+			.first_child()
+			.and_then(|el| el.first_child()?.value().as_text());
+
+		if let Some(details) = details {
+			let details_split: Vec<&str> = details.split(" \u{2022} ").take(2).collect();
+
+			info!("{:#?}", &details_split);
+
+			// Occasionally there is extra whitespace here, so this needs to be trimmed.
+			let points = details_split.first().expect("split should exist").trim();
+			debug!("{:?}", points);
+
+			let points_slice = &points.get(..points.len() - 4).expect("should be at indice");
+			info!("{:?}", points_slice);
+
+			let points = points_slice
+				.parse::<u8>()
+				.expect("should correctly parse points");
+			info!("{:?}", points);
+
+			self.points = points;
+
+			if let Some(requirements) = details_split.last().map(|s| s.trim()) {
+				if requirements.starts_with("(X)") {
+					self.restrictions = requirements
+						.get(4..)
+						.expect("should be at indice")
+						.split(SPLIT_SLICE)
+						.map(str::trim)
+						.collect::<Vec<&str>>();
+				} else if requirements.starts_with("(P)") {
+					let requirements = &requirements
+						.get(4..)
+						.expect("should be at indice")
+						.split(" (X) ")
+						.collect::<Vec<&str>>();
+
+					self.prerequisites = requirements
+						.first()
+						.map(|s| {
+							s.split(SPLIT_SLICE)
+								.map(str::trim)
+								.filter(|s| !s.is_empty())
+								.collect::<Vec<&str>>()
+						})
+						.unwrap_or_default();
+
+					if requirements.len() > 1 {
+						self.restrictions = requirements
+							.last()
+							.map(|s| s.split(SPLIT_SLICE).map(str::trim).collect::<Vec<&str>>())
+							.unwrap_or_default();
+					}
+				} else {
+					self.prerequisites = vec![requirements];
+				}
+
+				info!("{requirements}");
+			}
+		}
+	}
+
+	/// Parses a course offering from the given element and adds it to the timetable.
+	///
+	/// # Panics
+	///
+	/// Panics if the course reference number or trimester cannot be parsed.
+	pub fn parse_timetable(&mut self, elem: ElementRef<'a>) {
+		// Parse timetable / CRNs.
+		let details = elem
+			.first_child()
+			.and_then(|el| el.first_child()?.value().as_text());
+
+		if let Some(details) = details {
+			let details_split: Vec<&str> = details.split(" \u{2022} ").take(2).collect();
+
+			info!("{:#?}", &details_split);
+
+			let offering = CourseOffering::new(
+				details_split
+					.last()
+					.expect("course reference number should exist")
+					.get(4..)
+					.expect("course reference number digits should start at this indice")
+					.parse::<u16>()
+					.expect("course reference number should be parseable"),
+				Trimester::try_from(
+					*details_split
+						.first()
+						.expect("trimester element should exist"),
+				)
+				.expect("should be parseable into a trimester"),
+			);
+
+			self.timetable.push(offering);
+		}
+	}
+}
+
+impl Default for Course<'_> {
+	fn default() -> Self {
+		Self {
+			description: Option::default(),
+			offered: true,
+			points: u8::default(),
+			prerequisites: Vec::default(),
+			restrictions: Vec::default(),
+			subject_areas: HashSet::default(),
+			subtitle: "",
+			timetable: Vec::default(),
+			title: "",
+		}
+	}
+}
+
+/// A course offering, includes the CRN and [`Trimester`].
+#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
+#[non_exhaustive]
+pub struct CourseOffering {
+	/// Reference number for this course, e.g. 11723.
+	pub course_reference_number: u16,
+
+	/// Trimester this course is offered in.
+	pub trimester: Trimester,
+}
+
+impl CourseOffering {
+	/// Creates a new [`CourseOffering`].
+	#[must_use]
+	pub const fn new(course_reference_number: u16, trimester: Trimester) -> Self {
+		Self {
+			course_reference_number,
+			trimester,
+		}
+	}
+}
+
+/// Trimester information Victoria University of Wellington offers.
+#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, PartialOrd, Ord, Serialize)]
+pub enum Trimester {
+	/// Trimester one.
+	One,
+
+	/// Trimester two.
+	Two,
+
+	/// Trimester three.
+	Three,
+
+	/// Block dates. See course page for more information.
+	BlockDates,
+
+	/// Part year. See course page for more information.
+	PartYear,
+
+	/// Trimesters one and two.
+	OneTwo,
+
+	/// Trimesters two and three.
+	TwoThree,
+
+	/// Trimesters three and one.
+	ThreeOne,
+
+	/// Trimesters one, two, and three.
+	FullYear,
+}
+
+impl TryFrom<&str> for Trimester {
+	type Error = String;
+
+	fn try_from(value: &str) -> Result<Self, Self::Error> {
+		match value {
+			"1/3" => Ok(Self::One),
+			"2/3" => Ok(Self::Two),
+			"3/3" => Ok(Self::Three),
+			"block dates/3" => Ok(Self::BlockDates),
+			"part year/3" => Ok(Self::PartYear),
+			"1+2/3" => Ok(Self::OneTwo),
+			"2+3/3" => Ok(Self::TwoThree),
+			"3+1/3" => Ok(Self::ThreeOne),
+			"1+2+3/3" => Ok(Self::FullYear),
+			_ => Err(String::from("Invalid trimester.")),
+		}
+	}
+}
+
+impl fmt::Display for Course<'_> {
+	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+		write!(
+			f,
+			"Course {{ title: {}, subtitle: {}, offered: {}, areas: [{}] }}",
+			self.title,
+			self.subtitle,
+			self.offered,
+			self.subject_areas
+                .iter()
+                // The set iterates over `&&str`; dereference to `&str` to build the `Vec<&str>`.
+                .map(|s| &**s)
+                .collect::<Vec<&str>>()
+                .join(", "),
+		)
+	}
+}
diff --git a/src/main.rs b/src/main.rs
index ee7686e..05f3976 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,270 +1,85 @@
 // SPDX-License-Identifier: AGPL-3.0-or-later
 
-use serde::{Deserialize, Serialize};
-use std::{
-    collections::{HashMap, HashSet},
-    fmt, fs,
-};
+//! # VUW Course scraper
+//!
+//! This is a simple program capable of parsing VUW's courses from the registry. It cannot
+//! correctly parse prerequisites, however.
 
-use tracing::{debug, info, level_filters::LevelFilter};
+use std::{collections::HashMap, fs};
 
 use scraper::{CaseSensitivity, Html, Selector};
+use serde::{Deserialize, Serialize};
+use tracing::level_filters::LevelFilter;
+use vuw_course_scraper::Course;
 
-// TODO: Use string slices to avoid clones?
-#[derive(Clone, Debug, Deserialize, Serialize)]
-struct Course<'a> {
-    description: Option<&'a str>,
-    offered: bool,
-    points: u8,
-    prerequisites: Vec<String>,
-    restrictions: Vec<String>,
-    subject_areas: HashSet<String>,
-    subtitle: &'a str,
-    timetable: Vec<CourseOffering>,
-    title: &'a str,
-}
-
-#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
-struct CourseOffering {
-    course_reference_number: u16,
-    trimester: Trimester,
-}
-
-#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, PartialOrd, Ord, Serialize)]
-enum Trimester {
-    One,
-    Two,
-    Three,
-    BlockDates,
-    PartYear,
-    OneTwo,
-    TwoThree,
-    ThreeOne,
-    FullYear,
-}
-
-impl TryFrom<&str> for Trimester {
-    type Error = String;
-
-    fn try_from(value: &str) -> Result<Self, Self::Error> {
-        match value {
-            "1/3" => Ok(Self::One),
-            "2/3" => Ok(Self::Two),
-            "3/3" => Ok(Self::Three),
-            "block dates/3" => Ok(Self::BlockDates),
-            "part year/3" => Ok(Self::PartYear),
-            "1+2/3" => Ok(Self::OneTwo),
-            "2+3/3" => Ok(Self::TwoThree),
-            "3+1/3" => Ok(Self::ThreeOne),
-            "1+2+3/3" => Ok(Self::FullYear),
-            _ => Err(String::from("Invalid trimester.")),
-        }
-    }
-}
-
+/// Utility struct for exporting to JSON.
 #[derive(Clone, Deserialize, Serialize)]
 struct JsonExport<'a> {
-    #[serde(borrow)]
-    courses: HashMap<&'a str, Course<'a>>,
-}
-
-impl Default for Course<'_> {
-    fn default() -> Self {
-        Self {
-            description: Option::default(),
-            offered: true,
-            points: u8::default(),
-            prerequisites: Vec::default(),
-            restrictions: Vec::default(),
-            subject_areas: HashSet::default(),
-            subtitle: "",
-            timetable: Vec::default(),
-            title: "",
-        }
-    }
-}
-
-impl fmt::Display for Course<'_> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "Course {{ title: {}, subtitle: {}, offered: {}, areas: [{}] }}",
-            self.title,
-            self.subtitle,
-            self.offered,
-            self.subject_areas
-                .iter()
-                // Necessary as Rust refuses to build Vec<&str> with &String.
-                .map(|s| &s[..])
-                .collect::<Vec<&str>>()
-                .join(", "),
-        )
-    }
+	/// [`HashMap`] of all courses.
+	#[serde(borrow)]
+	courses: HashMap<&'a str, Course<'a>>,
 }
 
 fn main() {
-    tracing_subscriber::fmt()
-        .with_max_level(LevelFilter::INFO)
-        .init();
-
-    let html = include_str!("../courses.html");
-
-    let document = Html::parse_document(html);
-
-    let mut course_map: HashMap<&str, Course> = HashMap::new();
-
-    let mut subject_area = String::new();
-    let mut working_course = Course::default();
-
-    for elem in document.select(&Selector::parse("p").unwrap()) {
-        let elem_value = elem.value();
-
-        if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) {
-            course_map
-                .entry(working_course.title)
-                .and_modify(|c| {
-                    c.subject_areas.insert(subject_area.clone());
-                })
-                .or_insert(working_course);
-            working_course = Course::default();
-            working_course.subject_areas.insert(subject_area.clone());
-
-            elem.children().for_each(|child| {
-                child.children().for_each(|c| {
-                    if c.value().is_text() {
-                        let working = &c.value().as_text().unwrap()[..];
-
-                        // Skip over space.
-                        working_course.title = &working[..working.len() - 1];
-                    } else {
-                        working_course.subtitle = &c
-                            .first_child()
-                            .unwrap()
-                            .value()
-                            .as_text()
-                            .unwrap()
-                            // Skip over "- ".
-                            [4..];
-                    }
-                });
-            });
-        } else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) {
-            working_course.offered = false;
-        } else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) {
-            subject_area = elem
-                .first_child()
-                .unwrap()
-                .first_child()
-                .unwrap()
-                .value()
-                .as_text()
-                .unwrap()
-                .to_string();
-        } else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) {
-            let description = elem
-                .first_child()
-                .and_then(|el| el.first_child()?.value().as_text())
-                .map(|t| &t[..]);
-
-            working_course.description = description;
-
-            // println!("{}", working_course.description);
-        } else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) {
-            // Parse timetable / CRNs.
-            let details = elem
-                .first_child()
-                .and_then(|el| el.first_child()?.value().as_text())
-                .map(|t| t.to_string());
-
-            if let Some(details) = details {
-                let details_split: Vec<&str> = details.split(" • ").take(2).collect();
-
-                info!("{:#?}", &details_split);
-
-                let offering = CourseOffering {
-                    course_reference_number: details_split.last().unwrap()[4..]
-                        .parse::<u16>()
-                        .unwrap(),
-                    trimester: Trimester::try_from(*details_split.first().unwrap()).unwrap(),
-                };
-
-                working_course.timetable.push(offering);
-            }
-        } else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) {
-            // Parse course points, prerequisites, and exclusions.
-            let details = elem
-                .first_child()
-                .and_then(|el| el.first_child()?.value().as_text())
-                .map(|t| t.to_string());
-
-            if let Some(details) = details {
-                let details_split: Vec<&str> = details.split(" • ").take(2).collect();
-
-                info!("{:#?}", &details_split);
-
-                // Occasionally there is extra whitespace here, so this needs to be trimmed.
-                let points = details_split.first().unwrap().trim();
-                debug!("{:?}", points);
-
-                let points_slice = &points[..points.len() - 4];
-                info!("{:?}", points_slice);
-
-                let points = points_slice.parse::<u8>().unwrap();
-                info!("{:?}", points);
-
-                working_course.points = points;
-
-                if let Some(requirements) = details_split.last().map(|s| s.trim()) {
-                    if requirements.starts_with("(X)") {
-                        working_course.restrictions = requirements[4..]
-                            .split(&[';', ','])
-                            .map(str::trim)
-                            .map(str::to_owned)
-                            .collect::<Vec<String>>();
-                    } else {
-                        let requirements = &requirements[4..].split(" (X) ").collect::<Vec<&str>>();
-
-                        working_course.prerequisites = requirements
-                            .first()
-                            .unwrap()
-                            .split(&[',', ';'])
-                            .map(str::trim)
-                            .filter(|s| !s.is_empty())
-                            .map(str::to_owned)
-                            .collect();
-
-                        if requirements.len() > 1 {
-                            working_course.restrictions = requirements
-                                .last()
-                                .unwrap()
-                                .split(&[',', ';'])
-                                .map(str::trim)
-                                .map(str::to_owned)
-                                .collect();
-                        }
-                    }
-
-                    info!("{requirements}");
-                }
-            }
-        }
-    }
-
-    debug!("{:?}", course_map.get("COMP 102"));
-
-    course_map.remove("");
-
-    fs::write(
-        "./export.json",
-        simd_json::serde::to_string(&JsonExport {
-            courses: course_map,
-        })
-        .unwrap(),
-    )
-    .unwrap();
-
-    // course_map
-    //     .values()
-    //     .for_each(|c| println!("{:#?}", c.subject_areas));
-    // course_map.values().for_each(|c| println!("{c}"));
+	tracing_subscriber::fmt()
+		.with_max_level(LevelFilter::INFO)
+		.init();
+
+	let html = &fs::read_to_string("./courses.html").expect("file does not exist");
+
+	let document = Html::parse_document(html);
+
+	let mut course_map: HashMap<&str, Course> = HashMap::new();
+
+	let mut subject_area = "";
+	let mut working_course = Course::default();
+
+	for elem in document.select(&Selector::parse("p").expect("selector should always be valid")) {
+		let elem_value = elem.value();
+
+		if elem_value.has_class("courseid", CaseSensitivity::AsciiCaseInsensitive) {
+			course_map
+				.entry(working_course.title)
+				.and_modify(|c| {
+					c.subject_areas.insert(subject_area);
+				})
+				.or_insert(working_course);
+			working_course = Course::default();
+			working_course.subject_areas.insert(subject_area);
+
+			working_course.parse_courseid(elem);
+		} else if elem_value.has_class("notoffered", CaseSensitivity::CaseSensitive) {
+			working_course.offered = false;
+		} else if elem_value.has_class("subjectarea", CaseSensitivity::CaseSensitive) {
+			if let Some(subject_area_name) = elem.first_child().and_then(|child| {
+				child
+					.first_child()
+					.and_then(|nexted_child| nexted_child.value().as_text())
+			}) {
+				subject_area = &**subject_area_name;
+			}
+		} else if elem_value.has_class("subjectsbody", CaseSensitivity::CaseSensitive) {
+			let description = elem
+				.first_child()
+				.and_then(|el| el.first_child()?.value().as_text())
+				.map(|t| &**t);
+
+			working_course.description = description;
+		} else if elem_value.has_class("timetable", CaseSensitivity::CaseSensitive) {
+			working_course.parse_timetable(elem);
+		} else if elem_value.has_class("coursepoints", CaseSensitivity::CaseSensitive) {
+			working_course.parse_coursepoints(elem);
+		}
+	}
+
+	course_map.remove("");
+
+	fs::write(
+		"./export.json",
+		simd_json::serde::to_string(&JsonExport {
+			courses: course_map,
+		})
+		.expect("json should parse correctly"),
+	)
+	.expect("file should be writable");
 }