8 Commits

Author SHA1 Message Date
cdc89b5bcd fix(parser): fix sentry error sending 2025-10-10 03:00:47 +04:00
ad86f6cd64 feat(parser): limit names regex to maximum 2 elements
This means we no longer have to worry about indexing past the end of the subgroups array, and it lets us handle non-standard cases better.
2025-10-10 01:39:54 +04:00
a3b4a501db feat(parser): improve names regex to exclude some non-standard cases
For example, "Название ФАмилия. И.О.".
In that case the regex would grab "Название ФА" instead of "Амилия. И. О." (we can't include 'Ф', because that would make the regex checks far more complex).

Now the regex ignores "Название ФА" if it is followed by either a lowercase or an uppercase character.
Previously only lowercase characters were excluded, so the check did not reject "Название ФА" and instead grabbed "Название Ф", because 'Ф' is followed by an uppercase character.
2025-10-10 01:37:52 +04:00
df0e99a4d0 feat(parser): make lesson cell range less strict to support upcoming split-lessons 2025-10-10 01:31:55 +04:00
a8cf8fb0f5 feat(parser): improve street regex 2025-10-10 01:30:56 +04:00
7ed866138e feat(error): add error for unknown lesson type 2025-10-10 01:30:30 +04:00
7bac48f8fc feat(error): add more intuitive CellPos formatting and get rid of ErrorCell 2025-10-10 01:27:05 +04:00
191ec36fef chore: remove useless commented code 2025-10-10 01:25:12 +04:00
4 changed files with 73 additions and 113 deletions

View File

@@ -102,7 +102,7 @@ pub enum LessonType {
CourseProjectDefense, CourseProjectDefense,
/// Практическое занятие. /// Практическое занятие.
Practice Practice,
} }
#[derive(Clone, Hash, Debug, Serialize, Deserialize, ToSchema)] #[derive(Clone, Hash, Debug, Serialize, Deserialize, ToSchema)]
@@ -212,70 +212,6 @@ impl ScheduleSnapshot {
} }
} }
// #[derive(Clone, Debug, Display, Error, ToSchema)]
// #[display("row {row}, column {column}")]
// pub struct ErrorCellPos {
// pub row: u32,
// pub column: u32,
// }
//
// #[derive(Clone, Debug, Display, Error, ToSchema)]
// #[display("'{data}' at {pos}")]
// pub struct ErrorCell {
// pub pos: ErrorCellPos,
// pub data: String,
// }
//
// impl ErrorCell {
// pub fn new(row: u32, column: u32, data: String) -> Self {
// Self {
// pos: ErrorCellPos { row, column },
// data,
// }
// }
// }
// #[derive(Clone, Debug, Display, Error, ToSchema)]
// pub enum ParseError {
// /// Errors related to reading XLS file.
// #[display("{_0:?}: Failed to read XLS file.")]
// #[schema(value_type = String)]
// BadXLS(Arc<calamine::XlsError>),
//
// /// Not a single sheet was found.
// #[display("No work sheets found.")]
// NoWorkSheets,
//
// /// There are no data on the boundaries of the sheet.
// #[display("There is no data on work sheet boundaries.")]
// UnknownWorkSheetRange,
//
// /// Failed to read the beginning and end of the lesson from the cell
// #[display("Failed to read lesson start and end from {_0}.")]
// LessonBoundaries(ErrorCell),
//
// /// Not found the beginning and the end corresponding to the lesson.
// #[display("No start and end times matching the lesson (at {_0}) was found.")]
// LessonTimeNotFound(ErrorCellPos),
// }
//
// impl Serialize for ParseError {
// fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
// where
// S: Serializer,
// {
// match self {
// ParseError::BadXLS(_) => serializer.serialize_str("BAD_XLS"),
// ParseError::NoWorkSheets => serializer.serialize_str("NO_WORK_SHEETS"),
// ParseError::UnknownWorkSheetRange => {
// serializer.serialize_str("UNKNOWN_WORK_SHEET_RANGE")
// }
// ParseError::LessonBoundaries(_) => serializer.serialize_str("GLOBAL_TIME"),
// ParseError::LessonTimeNotFound(_) => serializer.serialize_str("LESSON_TIME_NOT_FOUND"),
// }
// }
// }
#[async_trait] #[async_trait]
pub trait ScheduleProvider pub trait ScheduleProvider
where where

View File

@@ -1,21 +1,5 @@
use derive_more::{Display, Error, From};
use crate::parser::worksheet::CellPos; use crate::parser::worksheet::CellPos;
use derive_more::{Display, Error, From};
#[derive(Clone, Debug, Display, Error)]
#[display("'{data}' at {pos}")]
pub struct ErrorCell {
pub pos: CellPos,
pub data: String,
}
impl ErrorCell {
pub fn new(row: u32, column: u32, data: &str) -> Self {
Self {
pos: CellPos { row, column },
data: data.to_string(),
}
}
}
#[derive(Debug, Display, Error, From)] #[derive(Debug, Display, Error, From)]
pub enum Error { pub enum Error {
@@ -28,11 +12,14 @@ pub enum Error {
#[display("There is no data on work sheet boundaries.")] #[display("There is no data on work sheet boundaries.")]
UnknownWorkSheetRange, UnknownWorkSheetRange,
#[display("Failed to read lesson start and end from {_0}.")] #[display("Failed to read lesson start and end of lesson at {_0}.")]
NoLessonBoundaries(ErrorCell), NoLessonBoundaries(CellPos),
#[display("No start and end times matching the lesson (at {_0}) was found.")] #[display("No start and end times matching the lesson (at {_0}) was found.")]
LessonTimeNotFound(CellPos), LessonTimeNotFound(CellPos),
#[display("Unknown lesson type `{type}` at {pos}")]
UnknownLessonType { pos: CellPos, r#type: String },
} }
pub type Result<T> = core::result::Result<T, Error>; pub type Result<T> = core::result::Result<T, Error>;

View File

@@ -1,6 +1,5 @@
pub use self::error::{Error, Result}; pub use self::error::{Error, Result};
use crate::or_continue; use crate::or_continue;
use crate::parser::error::ErrorCell;
use crate::parser::worksheet::{CellPos, CellRange, WorkSheet}; use crate::parser::worksheet::{CellPos, CellRange, WorkSheet};
use crate::parser::LessonParseResult::{Lessons, Street}; use crate::parser::LessonParseResult::{Lessons, Street};
use base::LessonType::Break; use base::LessonType::Break;
@@ -217,7 +216,7 @@ fn parse_lesson(
}; };
static OTHER_STREET_RE: LazyLock<Regex> = static OTHER_STREET_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^[А-Я][а-я]+[,\s]\d+$").unwrap()); LazyLock::new(|| Regex::new(r"^[А-Я][а-я]+[,\s]+д\.\s\d+$").unwrap());
if OTHER_STREET_RE.is_match(&cell_data) { if OTHER_STREET_RE.is_match(&cell_data) {
return Ok(Street(cell_data)); return Ok(Street(cell_data));
@@ -226,12 +225,17 @@ fn parse_lesson(
cell_data cell_data
}; };
let cell_range = worksheet.get_merge_from_start(row, group_column); let lesson_cell_range = worksheet.get_merge_from_start(row, group_column);
let (default_range, lesson_time) = { let (default_range, lesson_time) = {
let end_time_arr = day_boundaries let end_time_arr = day_boundaries
.iter() .iter()
.filter(|time| time.range.end.row == cell_range.end.row) .filter(
|BoundariesData {
range: CellRange { end, .. },
..
}| { lesson_cell_range.end.row <= end.row },
)
.collect::<Vec<&BoundariesData>>(); .collect::<Vec<&BoundariesData>>();
let end_time = end_time_arr let end_time = end_time_arr
@@ -257,12 +261,12 @@ fn parse_lesson(
name, name,
mut subgroups, mut subgroups,
r#type: lesson_type, r#type: lesson_type,
} = parse_name_and_subgroups(&name)?; } = parse_name_and_subgroups(&name, row, group_column)?;
{ {
let cabinets: Vec<String> = parse_cabinets( let cabinets: Vec<String> = parse_cabinets(
worksheet, worksheet,
(cell_range.start.row, cell_range.end.row), (lesson_cell_range.start.row, lesson_cell_range.end.row),
group_column + 1, group_column + 1,
); );
@@ -364,7 +368,7 @@ struct ParsedLessonName {
//noinspection GrazieInspection //noinspection GrazieInspection
/// Getting the "pure" name of the lesson and list of teachers from the text of the lesson cell. /// Getting the "pure" name of the lesson and list of teachers from the text of the lesson cell.
fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> { fn parse_name_and_subgroups(text: &str, row: u32, column: u32) -> Result<ParsedLessonName> {
// Части названия пары: // Части названия пары:
// 1. Само название. // 1. Само название.
// 2. Список преподавателей и подгрупп. // 2. Список преподавателей и подгрупп.
@@ -373,7 +377,7 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
// Регулярное выражение для получения ФИО преподавателей и номеров подгрупп (aka. второй части). // Регулярное выражение для получения ФИО преподавателей и номеров подгрупп (aka. второй части).
static NAME_RE: LazyLock<fancy_regex::Regex> = LazyLock::new(|| { static NAME_RE: LazyLock<fancy_regex::Regex> = LazyLock::new(|| {
fancy_regex::Regex::new( fancy_regex::Regex::new(
r"([А-Я][а-я]+(?:[\s.]*[А-Я]){1,2})(?=[^а-я])[.\s]*(?:\(?(\d)[\sа-я]*\)?)?", r"([А-Я][а-я]+(?:[\s.]*[А-Я]){1,2})(?=[^Аа-я])[.\s]*(?:\(?(\d)[\sа-я]*\)?)?",
) )
.unwrap() .unwrap()
}); });
@@ -394,10 +398,10 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
let mut lesson_name: Option<&str> = None; let mut lesson_name: Option<&str> = None;
let mut extra: Option<&str> = None; let mut extra: Option<&str> = None;
let mut shared_subgroup = false; let mut shared_subgroup = true;
let mut subgroups: [Option<LessonSubGroup>; 2] = [None, None]; let mut subgroups: [Option<LessonSubGroup>; 2] = [None, None];
for capture in NAME_RE.captures_iter(&text) { for capture in NAME_RE.captures_iter(&text).take(2) {
let capture = capture.unwrap(); let capture = capture.unwrap();
if lesson_name.is_none() { if lesson_name.is_none() {
@@ -438,17 +442,23 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
match subgroup_index { match subgroup_index {
None => { None => {
subgroups[0] = subgroup; // we have only 2 matches max so more than 2 subgroups we cant have 100%
subgroups[1] = None; *subgroups.iter_mut().find(|x| x.is_none()).unwrap() = subgroup;
shared_subgroup = true;
break;
} }
Some(num) => { Some(num) => {
// bc we have indexed subgroup
shared_subgroup = false;
// 1 - 1 = 0 | 2 - 1 = 1 | 3 - 1 = 2 (schedule index to array index) // 1 - 1 = 0 | 2 - 1 = 1 | 3 - 1 = 2 (schedule index to array index)
// 0 % 2 = 0 | 1 % 2 = 1 | 2 % 2 = 0 (clamp) // 0 % 2 = 0 | 1 % 2 = 1 | 2 % 2 = 0 (clamp)
let normalised = (num - 1) % 2; let subgroup_index = ((num - 1) % 2) as usize;
subgroups[normalised as usize] = subgroup; // if we have subgroup in that index (probably non-indexed, we change it index to free)
if subgroups[subgroup_index].is_some() {
subgroups.swap(0, 1);
}
subgroups[subgroup_index] = subgroup;
} }
} }
} }
@@ -456,7 +466,7 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
let subgroups = if lesson_name.is_none() { let subgroups = if lesson_name.is_none() {
Vec::new() Vec::new()
} else if shared_subgroup { } else if shared_subgroup {
Vec::from([subgroups[0].take()]) Vec::from([subgroups.into_iter().next().unwrap()])
} else { } else {
Vec::from(subgroups) Vec::from(subgroups)
}; };
@@ -475,13 +485,19 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName> {
if result.is_none() { if result.is_none() {
#[cfg(not(debug_assertions))] #[cfg(not(debug_assertions))]
sentry::capture_message( sentry::capture_error(&Error::UnknownLessonType {
&format!("Не удалось угадать тип пары '{}'!", extra), r#type: extra.to_string(),
sentry::Level::Warning, pos: CellPos::new(row, column),
); });
#[cfg(debug_assertions)] #[cfg(debug_assertions)]
log::warn!("Не удалось угадать тип пары '{}'!", extra); log::warn!(
"{}",
Error::UnknownLessonType {
r#type: extra.to_string(),
pos: CellPos::new(row, column),
}
);
} }
result result
@@ -548,9 +564,8 @@ fn parse_day_boundaries(
continue; continue;
}; };
let lesson_time = parse_lesson_boundaries_cell(&time_cell, date).ok_or( let lesson_time = parse_lesson_boundaries_cell(&time_cell, date)
Error::NoLessonBoundaries(ErrorCell::new(row, column, &time_cell)), .ok_or(Error::NoLessonBoundaries(CellPos::new(row, column)))?;
)?;
// type // type
let lesson_type = if time_cell.contains("пара") { let lesson_type = if time_cell.contains("пара") {

View File

@@ -1,5 +1,5 @@
use derive_more::Display;
use regex::Regex; use regex::Regex;
use std::fmt::{Display, Formatter};
use std::ops::Deref; use std::ops::Deref;
use std::sync::LazyLock; use std::sync::LazyLock;
@@ -9,13 +9,35 @@ pub struct WorkSheet {
pub merges: Vec<calamine::Dimensions>, pub merges: Vec<calamine::Dimensions>,
} }
#[derive(Clone, Debug, Display, derive_more::Error)] #[derive(Clone, Debug, derive_more::Error)]
#[display("row {row}, column {column}")]
pub struct CellPos { pub struct CellPos {
pub row: u32, pub row: u32,
pub column: u32, pub column: u32,
} }
/// Converts a zero-based column index into its letter label:
/// `0 -> "A"`, `25 -> "Z"`, `26 -> "AA"`, `27 -> "AB"`, and so on.
fn format_column_index(index: u32) -> String {
    // https://stackoverflow.com/a/297214
    // Letters are produced from the least significant position first,
    // so they are reversed when the final string is assembled.
    let mut letters: Vec<char> = Vec::new();
    let mut remaining = index;

    loop {
        letters.push(char::from(b'A' + (remaining % 26) as u8));

        let quotient = remaining / 26;
        if quotient == 0 {
            break;
        }

        // There is no "zero" letter, so each carried position is shifted down by one.
        remaining = quotient - 1;
    }

    letters.iter().rev().collect()
}
impl Display for CellPos {
    /// Writes the position as `column <letters>, row <number>`.
    ///
    /// The column index is rendered through `format_column_index`, and the
    /// stored row is printed incremented by one.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let column_label = format_column_index(self.column);
        let row_number = self.row + 1;

        write!(f, "column {}, row {}", column_label, row_number)
    }
}
pub struct CellRange { pub struct CellRange {
pub start: CellPos, pub start: CellPos,
pub end: CellPos, pub end: CellPos,