fix(parser): fix sentry error sending

feat(parser): limit names regex to maximum 2 elements
This lets us stop worrying about out-of-bounds indexes into the subgroups array and makes it easier to handle non-standard cases.
2025-12-06 09:47:50 +03:00 · 2025-10-10 03:00:47 +04:00 · 2025-10-10 01:39:54 +04:00 · 2025-10-10 01:37:52 +04:00 · 2025-10-10 01:31:55 +04:00 · 2025-10-10 01:30:56 +04:00
4 changed files with 73 additions and 113 deletions
--- a/providers/base/src/lib.rs
+++ b/providers/base/src/lib.rs
@@ -102,7 +102,7 @@ pub enum LessonType {
    CourseProjectDefense,
    /// Практическое занятие.
-    Practice
+    Practice,
 }
 #[derive(Clone, Hash, Debug, Serialize, Deserialize, ToSchema)]
@@ -212,70 +212,6 @@ impl ScheduleSnapshot {
    }
 }
 // #[derive(Clone, Debug, Display, Error, ToSchema)]
 // #[display("row {row}, column {column}")]
 // pub struct ErrorCellPos {
 //     pub row: u32,
 //     pub column: u32,
 // }
 //
 // #[derive(Clone, Debug, Display, Error, ToSchema)]
 // #[display("'{data}' at {pos}")]
 // pub struct ErrorCell {
 //     pub pos: ErrorCellPos,
 //     pub data: String,
 // }
 //
 // impl ErrorCell {
 //     pub fn new(row: u32, column: u32, data: String) -> Self {
 //         Self {
 //             pos: ErrorCellPos { row, column },
 //             data,
 //         }
 //     }
 // }
 // #[derive(Clone, Debug, Display, Error, ToSchema)]
 // pub enum ParseError {
 //     /// Errors related to reading XLS file.
 //     #[display("{_0:?}: Failed to read XLS file.")]
 //     #[schema(value_type = String)]
 //     BadXLS(Arc<calamine::XlsError>),
 //
 //     /// Not a single sheet was found.
 //     #[display("No work sheets found.")]
 //     NoWorkSheets,
 //
 //     /// There are no data on the boundaries of the sheet.
 //     #[display("There is no data on work sheet boundaries.")]
 //     UnknownWorkSheetRange,
 //
 //     /// Failed to read the beginning and end of the lesson from the cell
 //     #[display("Failed to read lesson start and end from {_0}.")]
 //     LessonBoundaries(ErrorCell),
 //
 //     /// Not found the beginning and the end corresponding to the lesson.
 //     #[display("No start and end times matching the lesson (at {_0}) was found.")]
 //     LessonTimeNotFound(ErrorCellPos),
 // }
 //
 // impl Serialize for ParseError {
 //     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
 //     where
 //         S: Serializer,
 //     {
 //         match self {
 //             ParseError::BadXLS(_) => serializer.serialize_str("BAD_XLS"),
 //             ParseError::NoWorkSheets => serializer.serialize_str("NO_WORK_SHEETS"),
 //             ParseError::UnknownWorkSheetRange => {
 //                 serializer.serialize_str("UNKNOWN_WORK_SHEET_RANGE")
 //             }
 //             ParseError::LessonBoundaries(_) => serializer.serialize_str("GLOBAL_TIME"),
 //             ParseError::LessonTimeNotFound(_) => serializer.serialize_str("LESSON_TIME_NOT_FOUND"),
 //         }
 //     }
 // }
 #[async_trait]
 pub trait ScheduleProvider
 where
--- a/providers/provider-engels-polytechnic/src/parser/error.rs
+++ b/providers/provider-engels-polytechnic/src/parser/error.rs
@@ -1,21 +1,5 @@
 use derive_more::{Display, Error, From};
 use crate::parser::worksheet::CellPos;
-
+use derive_more::{Display, Error, From};
 #[derive(Clone, Debug, Display, Error)]
 #[display("'{data}' at {pos}")]
 pub struct ErrorCell {
    pub pos: CellPos,
    pub data: String,
 }
 impl ErrorCell {
    pub fn new(row: u32, column: u32, data: &str) -> Self {
        Self {
            pos: CellPos { row, column },
            data: data.to_string(),
        }
    }
 }
 #[derive(Debug, Display, Error, From)]
 pub enum Error {
@@ -28,11 +12,14 @@ pub enum Error {
    #[display("There is no data on work sheet boundaries.")]
    UnknownWorkSheetRange,
-    #[display("Failed to read lesson start and end from {_0}.")]
+    #[display("Failed to read lesson start and end of lesson at {_0}.")]
-    NoLessonBoundaries(ErrorCell),
+    NoLessonBoundaries(CellPos),
    #[display("No start and end times matching the lesson (at {_0}) was found.")]
    LessonTimeNotFound(CellPos),
    #[display("Unknown lesson type `{type}` at {pos}")]
    UnknownLessonType { pos: CellPos, r#type: String },
 }
 pub type Result<T> = core::result::Result<T, Error>;
--- a/providers/provider-engels-polytechnic/src/parser/mod.rs
+++ b/providers/provider-engels-polytechnic/src/parser/mod.rs
@@ -1,6 +1,5 @@
 pub use self::error::{Error, Result};
 use crate::or_continue;
 use crate::parser::error::ErrorCell;
 use crate::parser::worksheet::{CellPos, CellRange, WorkSheet};
 use crate::parser::LessonParseResult::{Lessons, Street};
 use base::LessonType::Break;
@@ -217,7 +216,7 @@ fn parse_lesson(
        };
        static OTHER_STREET_RE: LazyLock<Regex> =
-            LazyLock::new(|| Regex::new(r"^[А-Я][а-я]+[,\s]\d+$").unwrap());
+            LazyLock::new(|| Regex::new(r"^[А-Я][а-я]+[,\s]+д\.\s\d+$").unwrap());
        if OTHER_STREET_RE.is_match(&cell_data) {
            return Ok(Street(cell_data));
@@ -226,12 +225,17 @@ fn parse_lesson(
        cell_data
    };
-    let cell_range = worksheet.get_merge_from_start(row, group_column);
+    let lesson_cell_range = worksheet.get_merge_from_start(row, group_column);
    let (default_range, lesson_time) = {
        let end_time_arr = day_boundaries
            .iter()
-            .filter(|time| time.range.end.row == cell_range.end.row)
+            .filter(
                |BoundariesData {
                     range: CellRange { end, .. },
                     ..
                 }| { lesson_cell_range.end.row <= end.row },
            )
            .collect::<Vec<&BoundariesData>>();
        let end_time = end_time_arr
@@ -257,12 +261,12 @@ fn parse_lesson(
        name,
        mut subgroups,
        r#type: lesson_type,
-    } = parse_name_and_subgroups(&name)?;
+    } = parse_name_and_subgroups(&name, row, group_column)?;
    {
        let cabinets: Vec<String> = parse_cabinets(
            worksheet,
-            (cell_range.start.row, cell_range.end.row),
+            (lesson_cell_range.start.row, lesson_cell_range.end.row),
            group_column + 1,
        );
@@ -364,7 +368,7 @@ struct ParsedLessonName {
 //noinspection GrazieInspection
 /// Getting the "pure" name of the lesson and list of teachers from the text of the lesson cell.
-fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
+fn parse_name_and_subgroups(text: &str, row: u32, column: u32) -> Result<ParsedLessonName> {
    // Части названия пары:
    // 1. Само название.
    // 2. Список преподавателей и подгрупп.
@@ -373,7 +377,7 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
    // Регулярное выражение для получения ФИО преподавателей и номеров подгрупп (aka. второй части).
    static NAME_RE: LazyLock<fancy_regex::Regex> = LazyLock::new(|| {
        fancy_regex::Regex::new(
-            r"([А-Я][а-я]+(?:[\s.]*[А-Я]){1,2})(?=[^а-я])[.\s]*(?:\(?(\d)[\sа-я]*\)?)?",
+            r"([А-Я][а-я]+(?:[\s.]*[А-Я]){1,2})(?=[^А-Яа-я])[.\s]*(?:\(?(\d)[\sа-я]*\)?)?",
        )
        .unwrap()
    });
@@ -394,10 +398,10 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
    let mut lesson_name: Option<&str> = None;
    let mut extra: Option<&str> = None;
-    let mut shared_subgroup = false;
+    let mut shared_subgroup = true;
    let mut subgroups: [Option<LessonSubGroup>; 2] = [None, None];
-    for capture in NAME_RE.captures_iter(&text) {
+    for capture in NAME_RE.captures_iter(&text).take(2) {
        let capture = capture.unwrap();
        if lesson_name.is_none() {
@@ -438,17 +442,23 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
        match subgroup_index {
            None => {
-                subgroups[0] = subgroup;
+                // at most 2 matches are processed, so there can never be more than 2 subgroups
-                subgroups[1] = None;
+                *subgroups.iter_mut().find(|x| x.is_none()).unwrap() = subgroup;
                shared_subgroup = true;
                break;
            }
            Some(num) => {
                // bc we have indexed subgroup
                shared_subgroup = false;
                // 1 - 1 = 0 | 2 - 1 = 1 | 3 - 1 = 2 (schedule index to array index)
                // 0 % 2 = 0 | 1 % 2 = 1 | 2 % 2 = 0 (clamp)
-                let normalised = (num - 1) % 2;
+                let subgroup_index = ((num - 1) % 2) as usize;
-                subgroups[normalised as usize] = subgroup;
+                // if that slot is already occupied (probably by a non-indexed subgroup), move the occupant to the other slot
                if subgroups[subgroup_index].is_some() {
                    subgroups.swap(0, 1);
                }
                subgroups[subgroup_index] = subgroup;
            }
        }
    }
@@ -456,7 +466,7 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
    let subgroups = if lesson_name.is_none() {
        Vec::new()
    } else if shared_subgroup {
-        Vec::from([subgroups[0].take()])
+        Vec::from([subgroups.into_iter().next().unwrap()])
    } else {
        Vec::from(subgroups)
    };
@@ -475,13 +485,19 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
        if result.is_none() {
            #[cfg(not(debug_assertions))]
-            sentry::capture_message(
+            sentry::capture_error(&Error::UnknownLessonType {
-                &format!("Не удалось угадать тип пары '{}'!", extra),
+                r#type: extra.to_string(),
-                sentry::Level::Warning,
+                pos: CellPos::new(row, column),
-            );
+            });
            #[cfg(debug_assertions)]
-            log::warn!("Не удалось угадать тип пары '{}'!", extra);
+            log::warn!(
                "{}",
                Error::UnknownLessonType {
                    r#type: extra.to_string(),
                    pos: CellPos::new(row, column),
                }
            );
        }
        result
@@ -548,9 +564,8 @@ fn parse_day_boundaries(
            continue;
        };
-        let lesson_time = parse_lesson_boundaries_cell(&time_cell, date).ok_or(
+        let lesson_time = parse_lesson_boundaries_cell(&time_cell, date)
-            Error::NoLessonBoundaries(ErrorCell::new(row, column, &time_cell)),
+            .ok_or(Error::NoLessonBoundaries(CellPos::new(row, column)))?;
        )?;
        // type
        let lesson_type = if time_cell.contains("пара") {
--- a/providers/provider-engels-polytechnic/src/parser/worksheet.rs
+++ b/providers/provider-engels-polytechnic/src/parser/worksheet.rs
@@ -1,5 +1,5 @@
 use derive_more::Display;
 use regex::Regex;
 use std::fmt::{Display, Formatter};
 use std::ops::Deref;
 use std::sync::LazyLock;
@@ -9,13 +9,35 @@ pub struct WorkSheet {
    pub merges: Vec<calamine::Dimensions>,
 }
-#[derive(Clone, Debug, Display, derive_more::Error)]
+#[derive(Clone, Debug, derive_more::Error)]
 #[display("row {row}, column {column}")]
 pub struct CellPos {
    pub row: u32,
    pub column: u32,
 }
 /// Converts a zero-based column index into spreadsheet-style column letters
 /// (`0` -> `A`, `25` -> `Z`, `26` -> `AA`, `702` -> `AAA`).
 fn format_column_index(index: u32) -> String {
    // Bijective base-26 conversion: https://stackoverflow.com/a/297214
    // Letters are produced from the least significant one, then reversed.
    let mut letters: Vec<char> = Vec::new();
    let mut remaining = index;
    loop {
        letters.push(char::from(b'A' + (remaining % 26) as u8));
        match remaining / 26 {
            0 => break,
            quotient => remaining = quotient - 1,
        }
    }
    letters.iter().rev().collect()
 }
 /// Renders the position as `column <letters>, row <row + 1>`.
 impl Display for CellPos {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let column_letters = format_column_index(self.column);
        // The stored row is shifted by one for display
        // (presumably it is zero-based internally; confirm against the worksheet reader).
        let display_row = self.row + 1;
        write!(f, "column {}, row {}", column_letters, display_row)
    }
 }
 pub struct CellRange {
    pub start: CellPos,
    pub end: CellPos,
Author	SHA1	Message	Date
n08i40k	cdc89b5bcd	fix(parser): fix sentry error sending	2025-10-10 03:00:47 +04:00
n08i40k	ad86f6cd64	feat(parser): limit names regex to maximum 2 elements This lets us stop worrying about out-of-bounds indexes into the subgroups array and makes it easier to handle non-standard cases.	2025-10-10 01:39:54 +04:00
n08i40k	a3b4a501db	feat(parser): improve names regex to exclude some non-standard cases, like "Название ФАмилия. И.О.". In that case the regex would grab "Название ФА" instead of "Амилия. И. О." (we can't include 'Ф', because that would make the regex checks far more complex). Now the regex ignores "Название ФА" if it is followed by a lowercase or an uppercase letter. Previously only lowercase letters were excluded, so the check did not reject the match and the regex grabbed "Название Ф", because 'Ф' is followed by an uppercase letter.	2025-10-10 01:37:52 +04:00
n08i40k	df0e99a4d0	feat(parser): make lesson cell range less strict to support upcoming split-lessons	2025-10-10 01:31:55 +04:00
n08i40k	a8cf8fb0f5	feat(parser): improve street regex	2025-10-10 01:30:56 +04:00
n08i40k	7ed866138e	feat(error): add error for unknown lesson type	2025-10-10 01:30:30 +04:00
n08i40k	7bac48f8fc	feat(error): add more intuitive CellPos formatting and get rid of ErrorCell	2025-10-10 01:27:05 +04:00
n08i40k	191ec36fef	chore: remove useless commented code	2025-10-10 01:25:12 +04:00