From 01bfa389692e3b7ae8b048828cbfc5be19dfdbd0 Mon Sep 17 00:00:00 2001 From: n08i40k Date: Tue, 27 May 2025 02:03:54 +0400 Subject: [PATCH] feat(parser): speed improvement, lesson type guessing and parsing of merged lesson cabinets --- Cargo.lock | 22 +-- schedule-parser/Cargo.toml | 3 +- schedule-parser/src/lib.rs | 265 +++++++++++++++++++++---------------- 3 files changed, 156 insertions(+), 134 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 23800b3..6919e87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1258,15 +1258,6 @@ dependencies = [ "slab", ] -[[package]] -name = "fuzzy-matcher" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54614a3312934d066701a80f20f15fa3b56d67ac7722b39eea5b4c9dd1d66c94" -dependencies = [ - "thread_local", -] - [[package]] name = "gcc" version = "0.3.55" @@ -2881,10 +2872,11 @@ dependencies = [ "chrono", "criterion", "derive_more", - "fuzzy-matcher", "regex", + "sentry", "serde", "serde_repr", + "strsim", "utoipa", ] @@ -3383,16 +3375,6 @@ dependencies = [ "syn 2.0.100", ] -[[package]] -name = "thread_local" -version = "1.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" -dependencies = [ - "cfg-if", - "once_cell", -] - [[package]] name = "time" version = "0.1.45" diff --git a/schedule-parser/Cargo.toml b/schedule-parser/Cargo.toml index 381f74f..61da019 100644 --- a/schedule-parser/Cargo.toml +++ b/schedule-parser/Cargo.toml @@ -10,11 +10,12 @@ test-utils = [] calamine = "0.26" chrono = { version = "0.4", features = ["serde"] } derive_more = { version = "2", features = ["full"] } +sentry = "0.38" serde = { version = "1.0.219", features = ["derive"] } serde_repr = "0.1.20" -fuzzy-matcher = "0.3.7" regex = "1.11.1" utoipa = { version = "5", features = ["chrono"] } +strsim = "0.11.1" [dev-dependencies] criterion = "0.6" diff --git a/schedule-parser/src/lib.rs b/schedule-parser/src/lib.rs index a37379a..df35529 100644 --- a/schedule-parser/src/lib.rs +++ b/schedule-parser/src/lib.rs @@ -1,15 +1,13 @@ -use crate::schema::internal::{BoundariesCellInfo, DayCellInfo, GroupCellInfo}; +use crate::LessonParseResult::{Lessons, Street}; use crate::schema::LessonType::Break; +use crate::schema::internal::{BoundariesCellInfo, DayCellInfo, GroupCellInfo}; use crate::schema::{ Day, ErrorCell, ErrorCellPos, Lesson, LessonBoundaries, LessonSubGroup, LessonType, ParseError, ParseResult, ScheduleEntry, }; use crate::worksheet::WorkSheet; -use crate::LessonParseResult::{Lessons, Street}; -use calamine::{open_workbook_from_rs, Reader, Xls}; +use calamine::{Reader, Xls, open_workbook_from_rs}; use chrono::{DateTime, Duration, NaiveDate, NaiveTime, Utc}; -use fuzzy_matcher::skim::SkimMatcherV2; -use fuzzy_matcher::FuzzyMatcher; use regex::Regex; use std::collections::HashMap; use std::io::Cursor; @@ -88,60 +86,29 @@ enum LessonParseResult { Street(String), } -trait StringInnerSlice { - /// Obtaining a line from the line on the initial and final index. - fn inner_slice(&self, from: usize, to: usize) -> Self; -} - -impl StringInnerSlice for String { - fn inner_slice(&self, from: usize, to: usize) -> Self { - self.chars() - .take(from) - .chain(self.chars().skip(to)) - .collect() - } -} - // noinspection GrazieInspection /// Obtaining a non-standard type of lesson by name. -fn guess_lesson_type(name: &String) -> Option<(String, LessonType)> { - let map: HashMap = HashMap::from([ - ("(консультация)".to_string(), LessonType::Consultation), - ( - "самостоятельная работа".to_string(), - LessonType::IndependentWork, - ), - ("зачет".to_string(), LessonType::Exam), - ("зачет с оценкой".to_string(), LessonType::ExamWithGrade), - ("экзамен".to_string(), LessonType::ExamDefault), - ]); +fn guess_lesson_type(text: &String) -> Option { + static MAP: LazyLock> = LazyLock::new(|| { + HashMap::from([ + ("консультация", LessonType::Consultation), + ("самостоятельная работа", LessonType::IndependentWork), + ("зачет", LessonType::Exam), + ("зачет с оценкой", LessonType::ExamWithGrade), + ("экзамен", LessonType::ExamDefault), + ]) + }); - let matcher = SkimMatcherV2::default(); - let name_lower = name.to_lowercase(); + let name_lower = text.to_lowercase(); - type SearchResult<'a> = (&'a LessonType, i64, Vec); - - let mut search_results: Vec = map + match MAP .iter() - .map(|entry| -> SearchResult { - if let Some((score, indices)) = matcher.fuzzy_indices(&*name_lower, entry.0) { - return (entry.1, score, indices); - } - - (entry.1, 0, Vec::new()) - }) - .collect(); - search_results.sort_by(|a, b| b.1.cmp(&a.1)); - - let guessed_type = search_results.first().unwrap(); - - if guessed_type.1 > 80 { - Some(( - name.inner_slice(guessed_type.2[0], guessed_type.2[guessed_type.2.len() - 1]), - guessed_type.0.clone(), - )) - } else { - None + .map(|(text, lesson_type)| (lesson_type, strsim::levenshtein(text, &*name_lower))) + .filter(|x| x.1 <= 4) + .min_by_key(|(_, score)| *score) + { + None => None, + Some(v) => Some(v.0.clone()), } } @@ -155,28 +122,25 @@ fn parse_lesson( ) -> Result { let row = lesson_boundaries.xls_range.0.0; - let (name, lesson_type) = { - let full_name = match worksheet.get_string_from_cell(row, group_column) { + let name = { + let cell_data = match worksheet.get_string_from_cell(row, group_column) { Some(x) => x, None => return Ok(Lessons(Vec::new())), }; static OTHER_STREET_RE: LazyLock = - LazyLock::new(|| Regex::new(r"^[А-Я][а-я]+,?\s?[0-9]+$").unwrap()); + LazyLock::new(|| Regex::new(r"^[А-Я][а-я]+[,\s]\d+$").unwrap()); - if OTHER_STREET_RE.is_match(&full_name) { - return Ok(Street(full_name)); + if OTHER_STREET_RE.is_match(&cell_data) { + return Ok(Street(cell_data)); } - match guess_lesson_type(&full_name) { - Some(x) => x, - None => (full_name, lesson_boundaries.lesson_type.clone()), - } + cell_data }; - let (default_range, lesson_time) = { - let cell_range = worksheet.get_merge_from_start(row, group_column); + let cell_range = worksheet.get_merge_from_start(row, group_column); + let (default_range, lesson_time) = { let end_time_arr = day_boundaries .iter() .filter(|time| time.xls_range.1.0 == cell_range.1.0) @@ -205,10 +169,17 @@ fn parse_lesson( Ok((range, time)) }?; - let (name, mut subgroups) = parse_name_and_subgroups(&name)?; + let (name, mut subgroups, lesson_type) = parse_name_and_subgroups(&name)?; { - let cabinets: Vec = parse_cabinets(worksheet, row, group_column + 1); + let cabinets: Vec = parse_cabinets( + worksheet, + ( + cell_range.0.0, + cell_range.1.0, + ), + group_column + 1, + ); match cabinets.len() { // Если кабинетов нет, но есть подгруппы, назначаем им кабинет "??" @@ -254,7 +225,7 @@ fn parse_lesson( }; let lesson = Lesson { - lesson_type, + lesson_type: lesson_type.unwrap_or(lesson_boundaries.lesson_type.clone()), default_range, name: Some(name), time: lesson_time, @@ -285,10 +256,12 @@ fn parse_lesson( } /// Obtaining a list of cabinets to the right of the lesson cell. -fn parse_cabinets(worksheet: &WorkSheet, row: u32, column: u32) -> Vec { +fn parse_cabinets(worksheet: &WorkSheet, row_range: (u32, u32), column: u32) -> Vec { let mut cabinets: Vec = Vec::new(); - if let Some(raw) = worksheet.get_string_from_cell(row, column) { + for row in row_range.0..row_range.1 { + let raw = or_continue!(worksheet.get_string_from_cell(row, column)); + let clean = raw.replace("\n", " "); let parts: Vec<&str> = clean.split(" ").collect(); @@ -297,59 +270,117 @@ fn parse_cabinets(worksheet: &WorkSheet, row: u32, column: u32) -> Vec { cabinets.push(clean_part); } + + break; } cabinets } +//noinspection GrazieInspection /// Getting the "pure" name of the lesson and list of teachers from the text of the lesson cell. -fn parse_name_and_subgroups(name: &String) -> Result<(String, Vec), ParseError> { - static LESSON_RE: LazyLock = - LazyLock::new(|| Regex::new(r"(?:[А-Я][а-я]+[А-Я]{2}(?:\([0-9][а-я]+\))?)+$").unwrap()); - static TEACHER_RE: LazyLock = - LazyLock::new(|| Regex::new(r"([А-Я][а-я]+)([А-Я])([А-Я])(?:\(([0-9])[а-я]+\))?").unwrap()); - static CLEAN_RE: LazyLock = LazyLock::new(|| Regex::new(r"[\s.,]+").unwrap()); - static END_CLEAN_RE: LazyLock = LazyLock::new(|| Regex::new(r"[.\s]+$").unwrap()); +fn parse_name_and_subgroups( + text: &String, +) -> Result<(String, Vec, Option), ParseError> { + // Части названия пары: + // 1. Само название. + // 2. Список преподавателей и подгрупп. + // 3. "Модификатор" (чаще всего). + // + // Регулярное выражение для получения ФИО преподавателей и номеров подгрупп (aka. второй части). + // (?:[А-Я][а-я]+\s?(?:[А-Я][\s.]*){2}(?:\(\d\s?[а-я]+\))?(?:, )?)+[\s.]* + // + // Подробнее: + // (?: + // [А-Я][а-я]+ - Фамилия. + // \s? - Кто знает, будет ли там пробел. + // (?:[А-Я][\s.]*){2} - Имя и отчество с учётом случайных пробелов и точек. + // (?: + // \( - Открытие подгруппы. + // \s? - Кто знает, будет ли там пробел. + // \d - Номер подгруппы. + // \s? - Кто знает, будет ли там пробел. + // [а-я\s]+ - Слово "подгруппа" с учётов ошибок. + // \) - Закрытие подгруппы. + // )? - Явное указание подгруппы может отсутствовать по понятным причинам. + // (?:, )? - Разделители между отдельными частями. + // )+ + // [\s.]* - Забираем с собой всякий мусор, что бы не передать его в третью часть. - let (teachers, lesson_name) = { - let clean_name = CLEAN_RE.replace_all(&name, "").to_string(); + static NAMES_REGEX: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?:[А-Я][а-я]+\s?(?:[А-Я][\s.]*){2}(?:\(\s*\d\s*[а-я\s]+\))?(?:[\s,]+)?)+[\s.]*", + ) + .unwrap() + }); - if let Some(captures) = LESSON_RE.captures(&clean_name) { + // Отчистка + static CLEAN_RE: LazyLock = LazyLock::new(|| Regex::new(r"[\s\n\t]+").unwrap()); + + let text = CLEAN_RE + .replace(&text.replace(&[' ', '\t', '\n'], " "), " ") + .to_string(); + + let (lesson_name, mut subgroups, lesson_type) = match NAMES_REGEX.captures(&text) { + Some(captures) => { let capture = captures.get(0).unwrap(); - let capture_str = capture.as_str().to_string(); - let capture_name: String = capture_str.chars().take(5).collect(); - ( - END_CLEAN_RE.replace(&capture_str, "").to_string(), - END_CLEAN_RE - .replace(&name[0..name.find(&*capture_name).unwrap()], "") - .to_string(), - ) - } else { - return Ok((END_CLEAN_RE.replace(&name, "").to_string(), Vec::new())); + let subgroups: Vec = { + let src = capture.as_str().replace(&[' ', '.'], ""); + + src.split(',') + .map(|name| { + let open_bracket_index = name.find('('); + + let subgroup_number = open_bracket_index.map_or(0, |index| { + name[(index + 1)..(index + 2)].parse::().unwrap() + }); + + let teacher_name = { + let name_end = open_bracket_index.unwrap_or_else(|| name.len()); + + // Я ебал. Как же я долго до этого доходил. + format!( + "{} {}.{}.", + name.get(..name_end - 4).unwrap(), + name.get(name_end - 4..name_end - 2).unwrap(), + name.get(name_end - 2..name_end).unwrap(), + ) + }; + + LessonSubGroup { + number: subgroup_number, + cabinet: None, + teacher: teacher_name, + } + }) + .collect() + }; + + let name = text[..capture.start()].trim().to_string(); + let extra = text[capture.end()..].trim().to_string(); + + let lesson_type = if extra.len() > 4 { + let result = guess_lesson_type(&extra); + + #[cfg(not(debug_assertions))] + if result.is_none() { + sentry::capture_message( + &*format!("Не удалось угадать тип пары '{}'!", extra), + sentry::Level::Warning, + ); + } + + result + } else { + None + }; + + (name, subgroups, lesson_type) } + None => (text, Vec::new(), None), }; - let mut subgroups: Vec = Vec::new(); - - let teacher_it = TEACHER_RE.captures_iter(&teachers); - - for captures in teacher_it { - subgroups.push(LessonSubGroup { - number: match captures.get(4) { - Some(capture) => capture.as_str().to_string().parse::().unwrap(), - None => 0, - }, - cabinet: None, - teacher: format!( - "{} {}.{}.", - captures.get(1).unwrap().as_str().to_string(), - captures.get(2).unwrap().as_str().to_string(), - captures.get(3).unwrap().as_str().to_string() - ), - }); - } - // фикс, если у кого-то отсутствует индекс подгруппы if subgroups.len() == 1 { @@ -384,7 +415,7 @@ fn parse_name_and_subgroups(name: &String) -> Result<(String, Vec