diff --git a/Cargo.lock b/Cargo.lock index 70effc3..e639135 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -652,6 +652,21 @@ dependencies = [ "serde", ] +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "2.9.4" @@ -825,9 +840,9 @@ dependencies = [ [[package]] name = "calamine" -version = "0.30.1" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a9acfeb1555aa8def91fe8ff208aadaea850c109968ec35ac965edbe7d210b" +checksum = "da56b262e8a827c6b12c3dde4ea4622e0ff542bd2e9ea5855e4cb523481d77b7" dependencies = [ "atoi_simd", "byteorder 1.5.0", @@ -835,7 +850,7 @@ dependencies = [ "encoding_rs", "fast-float2", "log", - "quick-xml 0.37.5", + "quick-xml", "serde", "zip", ] @@ -1463,6 +1478,17 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "fancy-regex" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fast-float2" version = "0.2.3" @@ -2923,7 +2949,7 @@ checksum = "740ebea15c5d1428f910cd1a5f52cebf8d25006245ed8ade92702f4943d91e07" dependencies = [ "base64 0.22.1", "indexmap 2.11.4", - "quick-xml 0.38.3", + "quick-xml", "serde", "time 0.3.44", ] @@ -3075,6 +3101,7 @@ dependencies = [ "calamine", "chrono", "derive_more", + "fancy-regex", "log", "regex", "reqwest", @@ -3121,22 +3148,13 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" -[[package]] -name = "quick-xml" -version = "0.37.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" -dependencies = [ - "encoding_rs", - "memchr", -] - [[package]] name = "quick-xml" version = "0.38.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42a232e7487fc2ef313d96dde7948e7a3c05101870d8985e4fd8d26aedd27b89" dependencies = [ + "encoding_rs", "memchr", ] diff --git a/providers/provider-engels-polytechnic/Cargo.toml b/providers/provider-engels-polytechnic/Cargo.toml index d8a4b11..1c0190e 100644 --- a/providers/provider-engels-polytechnic/Cargo.toml +++ b/providers/provider-engels-polytechnic/Cargo.toml @@ -20,7 +20,7 @@ derive_more = { version = "2.0.1", features = ["error", "display"] } utoipa = { version = "5.4.0", features = ["macros", "chrono"] } -calamine = "0.30" +calamine = "0.31" async-trait = "0.1.89" reqwest = "0.12.23" @@ -29,4 +29,5 @@ regex = "1.11.2" strsim = "0.11.1" log = "0.4.27" sentry = "0.43.0" +fancy-regex = "0.16.2" diff --git a/providers/provider-engels-polytechnic/src/parser/mod.rs b/providers/provider-engels-polytechnic/src/parser/mod.rs index 07b3335..6bcb1b5 100644 --- a/providers/provider-engels-polytechnic/src/parser/mod.rs +++ b/providers/provider-engels-polytechnic/src/parser/mod.rs @@ -233,6 +233,7 @@ enum LessonParseResult { fn guess_lesson_type(text: &str) -> Option { static MAP: LazyLock> = LazyLock::new(|| { HashMap::from([ + ("о важном", LessonType::Additional), ("консультация", LessonType::Consultation), ("самостоятельная работа", LessonType::IndependentWork), ("зачет", LessonType::Exam), @@ -427,127 +428,128 @@ fn parse_name_and_subgroups(text: &str) -> Result { // 3. "Модификатор" (чаще всего). // // Регулярное выражение для получения ФИО преподавателей и номеров подгрупп (aka. второй части). - // (?:[А-Я][а-я]+\s?(?:[А-Я][\s.]*){2}(?:\(\d\s?[а-я]+\))?(?:, )?)+[\s.]* - // - // Подробнее: - // (?: - // [А-Я][а-я]+ - Фамилия. - // \s? - Кто знает, будет ли там пробел. - // (?:[А-Я][\s.]*){2} - Имя и отчество с учётом случайных пробелов и точек. - // (?: - // \( - Открытие подгруппы. - // \s? - Кто знает, будет ли там пробел. - // \d - Номер подгруппы. - // \s? - Кто знает, будет ли там пробел. - // [а-я\s]+ - Слово "подгруппа" с учётов ошибок. - // \) - Закрытие подгруппы. - // )? - Явное указание подгруппы может отсутствовать по понятным причинам. - // (?:, )? - Разделители между отдельными частями. - // )+ - // [\s.]* - Забираем с собой всякий мусор, что бы не передать его в третью часть. - - static NAMES_REGEX: LazyLock = LazyLock::new(|| { - Regex::new( - r"(?:[А-Я][а-я]+\s?(?:[А-Я][\s.]*){2}(?:\(?\s*\d\s*[а-я\s]+\)?)?(?:[\s,.]+)?){1,2}+[\s.,]*", + static NAME_RE: LazyLock = LazyLock::new(|| { + fancy_regex::Regex::new( + r"([А-Я][а-я]+(?:[\s.]*[А-Я]){1,2})(?=[^а-я])[.\s]*(?:\(?(\d)[\sа-я]*\)?)?", ) - .unwrap() + .unwrap() }); - // Отчистка - static CLEAN_RE: LazyLock = LazyLock::new(|| Regex::new(r"[\s\n\t]+").unwrap()); + let text = text + .chars() + .filter(|c: &char| { + c.is_whitespace() + || c.is_ascii_digit() + || (*c >= 'а' && *c <= 'я') + || (*c >= 'А' && *c <= 'Я') + || *c == '.' + || *c == '-' + }) + .collect::() + .replace(r"\s+", " "); - let text = CLEAN_RE - .replace(&text.replace([' ', '\t', '\n'], " ").replace(",", ""), " ") - .to_string(); + let mut lesson_name: Option<&str> = None; + let mut extra: Option<&str> = None; - let (lesson_name, subgroups, lesson_type) = match NAMES_REGEX.captures(&text) { - Some(captures) => { - let capture = captures.get(0).unwrap(); + let mut shared_subgroup = false; + let mut subgroups: [Option; 2] = [None, None]; - let subgroups: Vec> = { - let src = capture.as_str().replace([' ', '.'], ""); + for capture in NAME_RE.captures_iter(&text) { + let capture = capture.unwrap(); - let mut shared_subgroup = false; - let mut subgroups: [Option; 2] = [None, None]; - - for name in src.split(',') { - let digit_index = name.find(|c: char| c.is_ascii_digit()); - - let number: u8 = - digit_index.map_or(0, |index| name[(index)..(index + 1)].parse().unwrap()); - - let teacher_name = { - let name_end = name - .find(|c: char| !c.is_alphabetic()) - .unwrap_or(name.len()); - - // Я ебал. Как же я долго до этого доходил. - format!( - "{} {}.{}.", - name.get(..name_end - 4).unwrap(), - name.get(name_end - 4..name_end - 2).unwrap(), - name.get(name_end - 2..name_end).unwrap(), - ) - }; - - let lesson = Some(LessonSubGroup { - cabinet: None, - teacher: Some(teacher_name), - }); - - match number { - 0 => { - subgroups[0] = lesson; - subgroups[1] = None; - shared_subgroup = true; - break; - } - num => { - // 1 - 1 = 0 | 2 - 1 = 1 | 3 - 1 = 2 (schedule index to array index) - // 0 % 2 = 0 | 1 % 2 = 1 | 2 % 2 = 0 (clamp) - let normalised = (num - 1) % 2; - - subgroups[normalised as usize] = lesson; - } - } - } - - if shared_subgroup { - Vec::from([subgroups[0].take()]) - } else { - Vec::from(subgroups) - } - }; - - let name = text[..capture.start()].trim().to_string(); - let extra = text[capture.end()..].trim().to_string(); - - let lesson_type = if extra.len() > 4 { - let result = guess_lesson_type(&extra); - - if result.is_none() { - #[cfg(not(debug_assertions))] - sentry::capture_message( - &format!("Не удалось угадать тип пары '{}'!", extra), - sentry::Level::Warning, - ); - - #[cfg(debug_assertions)] - log::warn!("Не удалось угадать тип пары '{}'!", extra); - } - - result - } else { - None - }; - - (name, subgroups, lesson_type) + if lesson_name.is_none() { + lesson_name = Some(&text[..capture.get(0).unwrap().start()]); } - None => (text, Vec::new(), None), + + extra = Some(&text[capture.get(0).unwrap().end()..]); + + let teacher_name = { + let clean = capture + .get(1) + .unwrap() + .as_str() + .chars() + .filter(|c| c.is_alphabetic()) + .collect::>(); + + if clean.get(clean.len() - 2).is_some_and(|c| c.is_uppercase()) { + let (name, remaining) = clean.split_at(clean.len() - 2); + format!( + "{} {}.{}.", + name.iter().collect::(), + remaining[0], + remaining[1] + ) + } else { + let (remaining, name) = clean.split_last().unwrap(); + format!("{} {}.", name.iter().collect::(), remaining) + } + }; + + let subgroup_index = capture + .get(2) + .and_then(|m| Some(m.as_str().parse::().unwrap())); + + let subgroup = Some(LessonSubGroup { + cabinet: None, + teacher: Some(teacher_name), + }); + + match subgroup_index { + None => { + subgroups[0] = subgroup; + subgroups[1] = None; + shared_subgroup = true; + break; + } + Some(num) => { + // 1 - 1 = 0 | 2 - 1 = 1 | 3 - 1 = 2 (schedule index to array index) + // 0 % 2 = 0 | 1 % 2 = 1 | 2 % 2 = 0 (clamp) + let normalised = (num - 1) % 2; + + subgroups[normalised as usize] = subgroup; + } + } + } + + let subgroups = if lesson_name.is_none() { + Vec::new() + } else if shared_subgroup { + Vec::from([subgroups[0].take()]) + } else { + Vec::from(subgroups) + }; + + if extra.is_none() { + extra = text + .rfind(" ") + .and_then(|i| text[..i].rfind(" ")) + .map(|i| &text[i + 1..]); + } + + let lesson_type = if let Some(extra) = extra + && extra.len() > 4 + { + let result = guess_lesson_type(&extra); + + if result.is_none() { + #[cfg(not(debug_assertions))] + sentry::capture_message( + &format!("Не удалось угадать тип пары '{}'!", extra), + sentry::Level::Warning, + ); + + #[cfg(debug_assertions)] + log::warn!("Не удалось угадать тип пары '{}'!", extra); + } + + result + } else { + None }; Ok(ParsedLessonName { - name: lesson_name, + name: lesson_name.unwrap_or(&text).to_string(), subgroups, r#type: lesson_type, })