feat(parser): speed improvement, lesson type guessing and parsing of merged lesson cabinets

This commit is contained in:
2025-05-27 02:03:54 +04:00
parent 851ec9225f
commit 01bfa38969
3 changed files with 156 additions and 134 deletions

View File

@@ -10,11 +10,12 @@ test-utils = []
calamine = "0.26"
chrono = { version = "0.4", features = ["serde"] }
derive_more = { version = "2", features = ["full"] }
sentry = "0.38"
serde = { version = "1.0.219", features = ["derive"] }
serde_repr = "0.1.20"
fuzzy-matcher = "0.3.7"
regex = "1.11.1"
utoipa = { version = "5", features = ["chrono"] }
strsim = "0.11.1"
[dev-dependencies]
criterion = "0.6"

View File

@@ -1,15 +1,13 @@
use crate::schema::internal::{BoundariesCellInfo, DayCellInfo, GroupCellInfo};
use crate::LessonParseResult::{Lessons, Street};
use crate::schema::LessonType::Break;
use crate::schema::internal::{BoundariesCellInfo, DayCellInfo, GroupCellInfo};
use crate::schema::{
Day, ErrorCell, ErrorCellPos, Lesson, LessonBoundaries, LessonSubGroup, LessonType, ParseError,
ParseResult, ScheduleEntry,
};
use crate::worksheet::WorkSheet;
use crate::LessonParseResult::{Lessons, Street};
use calamine::{open_workbook_from_rs, Reader, Xls};
use calamine::{Reader, Xls, open_workbook_from_rs};
use chrono::{DateTime, Duration, NaiveDate, NaiveTime, Utc};
use fuzzy_matcher::skim::SkimMatcherV2;
use fuzzy_matcher::FuzzyMatcher;
use regex::Regex;
use std::collections::HashMap;
use std::io::Cursor;
@@ -88,60 +86,29 @@ enum LessonParseResult {
Street(String),
}
trait StringInnerSlice {
    /// Returns a copy of the string with the middle section cut out.
    ///
    /// The result consists of the first `from` characters followed by
    /// everything after the first `to` characters. Positions count `char`s,
    /// not bytes, so multi-byte text is never split inside a character.
    fn inner_slice(&self, from: usize, to: usize) -> Self;
}

impl StringInnerSlice for String {
    fn inner_slice(&self, from: usize, to: usize) -> Self {
        // Capacity is only a hint; the string grows on its own if needed.
        let mut remainder = String::with_capacity(self.len());

        // Head: everything before the cut.
        remainder.extend(self.chars().take(from));
        // Tail: everything after the cut.
        remainder.extend(self.chars().skip(to));

        remainder
    }
}
// noinspection GrazieInspection
/// Obtaining a non-standard type of lesson by name.
fn guess_lesson_type(name: &String) -> Option<(String, LessonType)> {
let map: HashMap<String, LessonType> = HashMap::from([
("(консультация)".to_string(), LessonType::Consultation),
(
"самостоятельная работа".to_string(),
LessonType::IndependentWork,
),
("зачет".to_string(), LessonType::Exam),
("зачет с оценкой".to_string(), LessonType::ExamWithGrade),
("экзамен".to_string(), LessonType::ExamDefault),
]);
fn guess_lesson_type(text: &String) -> Option<LessonType> {
static MAP: LazyLock<HashMap<&str, LessonType>> = LazyLock::new(|| {
HashMap::from([
("консультация", LessonType::Consultation),
("самостоятельная работа", LessonType::IndependentWork),
("зачет", LessonType::Exam),
("зачет с оценкой", LessonType::ExamWithGrade),
("экзамен", LessonType::ExamDefault),
])
});
let matcher = SkimMatcherV2::default();
let name_lower = name.to_lowercase();
let name_lower = text.to_lowercase();
type SearchResult<'a> = (&'a LessonType, i64, Vec<usize>);
let mut search_results: Vec<SearchResult> = map
match MAP
.iter()
.map(|entry| -> SearchResult {
if let Some((score, indices)) = matcher.fuzzy_indices(&*name_lower, entry.0) {
return (entry.1, score, indices);
}
(entry.1, 0, Vec::new())
})
.collect();
search_results.sort_by(|a, b| b.1.cmp(&a.1));
let guessed_type = search_results.first().unwrap();
if guessed_type.1 > 80 {
Some((
name.inner_slice(guessed_type.2[0], guessed_type.2[guessed_type.2.len() - 1]),
guessed_type.0.clone(),
))
} else {
None
.map(|(text, lesson_type)| (lesson_type, strsim::levenshtein(text, &*name_lower)))
.filter(|x| x.1 <= 4)
.min_by_key(|(_, score)| *score)
{
None => None,
Some(v) => Some(v.0.clone()),
}
}
@@ -155,28 +122,25 @@ fn parse_lesson(
) -> Result<LessonParseResult, ParseError> {
let row = lesson_boundaries.xls_range.0.0;
let (name, lesson_type) = {
let full_name = match worksheet.get_string_from_cell(row, group_column) {
let name = {
let cell_data = match worksheet.get_string_from_cell(row, group_column) {
Some(x) => x,
None => return Ok(Lessons(Vec::new())),
};
static OTHER_STREET_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^[А-Я][а-я]+,?\s?[0-9]+$").unwrap());
LazyLock::new(|| Regex::new(r"^[А-Я][а-я]+[,\s]\d+$").unwrap());
if OTHER_STREET_RE.is_match(&full_name) {
return Ok(Street(full_name));
if OTHER_STREET_RE.is_match(&cell_data) {
return Ok(Street(cell_data));
}
match guess_lesson_type(&full_name) {
Some(x) => x,
None => (full_name, lesson_boundaries.lesson_type.clone()),
}
cell_data
};
let (default_range, lesson_time) = {
let cell_range = worksheet.get_merge_from_start(row, group_column);
let cell_range = worksheet.get_merge_from_start(row, group_column);
let (default_range, lesson_time) = {
let end_time_arr = day_boundaries
.iter()
.filter(|time| time.xls_range.1.0 == cell_range.1.0)
@@ -205,10 +169,17 @@ fn parse_lesson(
Ok((range, time))
}?;
let (name, mut subgroups) = parse_name_and_subgroups(&name)?;
let (name, mut subgroups, lesson_type) = parse_name_and_subgroups(&name)?;
{
let cabinets: Vec<String> = parse_cabinets(worksheet, row, group_column + 1);
let cabinets: Vec<String> = parse_cabinets(
worksheet,
(
cell_range.0.0,
cell_range.1.0,
),
group_column + 1,
);
match cabinets.len() {
// Если кабинетов нет, но есть подгруппы, назначаем им кабинет "??"
@@ -254,7 +225,7 @@ fn parse_lesson(
};
let lesson = Lesson {
lesson_type,
lesson_type: lesson_type.unwrap_or(lesson_boundaries.lesson_type.clone()),
default_range,
name: Some(name),
time: lesson_time,
@@ -285,10 +256,12 @@ fn parse_lesson(
}
/// Obtaining a list of cabinets to the right of the lesson cell.
fn parse_cabinets(worksheet: &WorkSheet, row: u32, column: u32) -> Vec<String> {
fn parse_cabinets(worksheet: &WorkSheet, row_range: (u32, u32), column: u32) -> Vec<String> {
let mut cabinets: Vec<String> = Vec::new();
if let Some(raw) = worksheet.get_string_from_cell(row, column) {
for row in row_range.0..row_range.1 {
let raw = or_continue!(worksheet.get_string_from_cell(row, column));
let clean = raw.replace("\n", " ");
let parts: Vec<&str> = clean.split(" ").collect();
@@ -297,59 +270,117 @@ fn parse_cabinets(worksheet: &WorkSheet, row: u32, column: u32) -> Vec<String> {
cabinets.push(clean_part);
}
break;
}
cabinets
}
//noinspection GrazieInspection
/// Getting the "pure" name of the lesson and list of teachers from the text of the lesson cell.
fn parse_name_and_subgroups(name: &String) -> Result<(String, Vec<LessonSubGroup>), ParseError> {
static LESSON_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?:[А-Я][а-я]+[А-Я]{2}(?:\([0-9][а-я]+\))?)+$").unwrap());
static TEACHER_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"([А-Я][а-я]+)([А-Я])([А-Я])(?:\(([0-9])[а-я]+\))?").unwrap());
static CLEAN_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\s.,]+").unwrap());
static END_CLEAN_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[.\s]+$").unwrap());
fn parse_name_and_subgroups(
text: &String,
) -> Result<(String, Vec<LessonSubGroup>, Option<LessonType>), ParseError> {
// Parts of a lesson cell's text:
// 1. The lesson name itself.
// 2. The list of teachers and subgroups.
// 3. A "modifier" (most of the time).
//
// Regular expression that extracts teacher names and subgroup numbers (i.e. the second part):
// (?:[А-Я][а-я]+\s?(?:[А-Я][\s.]*){2}(?:\(\s*\d\s*[а-я\s]+\))?(?:[\s,]+)?)+[\s.]*
//
// In detail:
// (?:
// [А-Я][а-я]+ - Surname.
// \s? - There may or may not be a space here.
// (?:[А-Я][\s.]*){2} - First-name and patronymic initials, tolerating stray spaces and dots.
// (?:
// \( - Start of the subgroup marker.
// \s* - There may or may not be spaces here.
// \d - Subgroup number.
// \s* - There may or may not be spaces here.
// [а-я\s]+ - The word "подгруппа" (subgroup), tolerating typos.
// \) - End of the subgroup marker.
// )? - An explicit subgroup marker may be absent.
// (?:[\s,]+)? - Separators between individual entries.
// )+
// [\s.]* - Swallow trailing junk so that it does not leak into the third part.
let (teachers, lesson_name) = {
let clean_name = CLEAN_RE.replace_all(&name, "").to_string();
static NAMES_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"(?:[А-Я][а-я]+\s?(?:[А-Я][\s.]*){2}(?:\(\s*\d\s*[а-я\s]+\))?(?:[\s,]+)?)+[\s.]*",
)
.unwrap()
});
if let Some(captures) = LESSON_RE.captures(&clean_name) {
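// Cleanup: normalize whitespace before matching.
// NOTE(review): `Regex::replace` only substitutes the first match; `replace_all` is probably intended here — confirm.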
static CLEAN_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\s\n\t]+").unwrap());
let text = CLEAN_RE
.replace(&text.replace(&[' ', '\t', '\n'], " "), " ")
.to_string();
let (lesson_name, mut subgroups, lesson_type) = match NAMES_REGEX.captures(&text) {
Some(captures) => {
let capture = captures.get(0).unwrap();
let capture_str = capture.as_str().to_string();
let capture_name: String = capture_str.chars().take(5).collect();
(
END_CLEAN_RE.replace(&capture_str, "").to_string(),
END_CLEAN_RE
.replace(&name[0..name.find(&*capture_name).unwrap()], "")
.to_string(),
)
} else {
return Ok((END_CLEAN_RE.replace(&name, "").to_string(), Vec::new()));
let subgroups: Vec<LessonSubGroup> = {
let src = capture.as_str().replace(&[' ', '.'], "");
src.split(',')
.map(|name| {
let open_bracket_index = name.find('(');
let subgroup_number = open_bracket_index.map_or(0, |index| {
name[(index + 1)..(index + 2)].parse::<u8>().unwrap()
});
let teacher_name = {
let name_end = open_bracket_index.unwrap_or_else(|| name.len());
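// Offsets are in bytes: spaces and dots were stripped above, and each Cyrillic
// initial takes 2 bytes in UTF-8, so the 4 bytes before `name_end` are the two initials.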
format!(
"{} {}.{}.",
name.get(..name_end - 4).unwrap(),
name.get(name_end - 4..name_end - 2).unwrap(),
name.get(name_end - 2..name_end).unwrap(),
)
};
LessonSubGroup {
number: subgroup_number,
cabinet: None,
teacher: teacher_name,
}
})
.collect()
};
let name = text[..capture.start()].trim().to_string();
let extra = text[capture.end()..].trim().to_string();
let lesson_type = if extra.len() > 4 {
let result = guess_lesson_type(&extra);
#[cfg(not(debug_assertions))]
if result.is_none() {
sentry::capture_message(
&*format!("Не удалось угадать тип пары '{}'!", extra),
sentry::Level::Warning,
);
}
result
} else {
None
};
(name, subgroups, lesson_type)
}
None => (text, Vec::new(), None),
};
let mut subgroups: Vec<LessonSubGroup> = Vec::new();
let teacher_it = TEACHER_RE.captures_iter(&teachers);
for captures in teacher_it {
subgroups.push(LessonSubGroup {
number: match captures.get(4) {
Some(capture) => capture.as_str().to_string().parse::<u8>().unwrap(),
None => 0,
},
cabinet: None,
teacher: format!(
"{} {}.{}.",
captures.get(1).unwrap().as_str().to_string(),
captures.get(2).unwrap().as_str().to_string(),
captures.get(3).unwrap().as_str().to_string()
),
});
}
// фикс, если у кого-то отсутствует индекс подгруппы
if subgroups.len() == 1 {
@@ -384,7 +415,7 @@ fn parse_name_and_subgroups(name: &String) -> Result<(String, Vec<LessonSubGroup
subgroups.reverse()
}
Ok((lesson_name, subgroups))
Ok((lesson_name, subgroups, lesson_type))
}
/// Getting the start and end of a pair from a cell in the first column of a document.
@@ -719,9 +750,17 @@ pub mod tests {
assert!(result.groups.contains_key("ИС-214/23"));
let group = result.groups.get("ИС-214/23").unwrap();
let thursday = group.days.get(3).unwrap();
let thursday = group.days.get(3).unwrap();
assert_eq!(thursday.lessons.len(), 1);
assert_eq!(thursday.lessons[0].default_range.unwrap()[1], 3);
let lesson = &thursday.lessons[0];
assert_eq!(lesson.default_range.unwrap()[1], 3);
assert!(lesson.subgroups.is_some());
let subgroups = lesson.subgroups.as_ref().unwrap();
assert_eq!(subgroups.len(), 2);
assert_eq!(subgroups[0].cabinet, Some("44".to_string()));
assert_eq!(subgroups[1].cabinet, Some("43".to_string()));
}
}