mirror of
https://github.com/n08i40k/schedule-parser-rusted.git
synced 2025-12-06 09:47:50 +03:00
Compare commits
2 Commits
b664ba578d
...
69df538467
| Author | SHA1 | Date | |
|---|---|---|---|
|
69df538467
|
|||
|
aa019f8fcf
|
46
Cargo.lock
generated
46
Cargo.lock
generated
@@ -652,6 +652,21 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bit-set"
|
||||||
|
version = "0.8.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
|
||||||
|
dependencies = [
|
||||||
|
"bit-vec",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bit-vec"
|
||||||
|
version = "0.8.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bitflags"
|
name = "bitflags"
|
||||||
version = "2.9.4"
|
version = "2.9.4"
|
||||||
@@ -825,9 +840,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "calamine"
|
name = "calamine"
|
||||||
version = "0.30.1"
|
version = "0.31.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b1a9acfeb1555aa8def91fe8ff208aadaea850c109968ec35ac965edbe7d210b"
|
checksum = "da56b262e8a827c6b12c3dde4ea4622e0ff542bd2e9ea5855e4cb523481d77b7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"atoi_simd",
|
"atoi_simd",
|
||||||
"byteorder 1.5.0",
|
"byteorder 1.5.0",
|
||||||
@@ -835,7 +850,7 @@ dependencies = [
|
|||||||
"encoding_rs",
|
"encoding_rs",
|
||||||
"fast-float2",
|
"fast-float2",
|
||||||
"log",
|
"log",
|
||||||
"quick-xml 0.37.5",
|
"quick-xml",
|
||||||
"serde",
|
"serde",
|
||||||
"zip",
|
"zip",
|
||||||
]
|
]
|
||||||
@@ -1463,6 +1478,17 @@ dependencies = [
|
|||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fancy-regex"
|
||||||
|
version = "0.16.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f"
|
||||||
|
dependencies = [
|
||||||
|
"bit-set",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fast-float2"
|
name = "fast-float2"
|
||||||
version = "0.2.3"
|
version = "0.2.3"
|
||||||
@@ -2923,7 +2949,7 @@ checksum = "740ebea15c5d1428f910cd1a5f52cebf8d25006245ed8ade92702f4943d91e07"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"base64 0.22.1",
|
"base64 0.22.1",
|
||||||
"indexmap 2.11.4",
|
"indexmap 2.11.4",
|
||||||
"quick-xml 0.38.3",
|
"quick-xml",
|
||||||
"serde",
|
"serde",
|
||||||
"time 0.3.44",
|
"time 0.3.44",
|
||||||
]
|
]
|
||||||
@@ -3075,6 +3101,7 @@ dependencies = [
|
|||||||
"calamine",
|
"calamine",
|
||||||
"chrono",
|
"chrono",
|
||||||
"derive_more",
|
"derive_more",
|
||||||
|
"fancy-regex",
|
||||||
"log",
|
"log",
|
||||||
"regex",
|
"regex",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
@@ -3121,22 +3148,13 @@ version = "1.2.3"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
|
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "quick-xml"
|
|
||||||
version = "0.37.5"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
|
|
||||||
dependencies = [
|
|
||||||
"encoding_rs",
|
|
||||||
"memchr",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quick-xml"
|
name = "quick-xml"
|
||||||
version = "0.38.3"
|
version = "0.38.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "42a232e7487fc2ef313d96dde7948e7a3c05101870d8985e4fd8d26aedd27b89"
|
checksum = "42a232e7487fc2ef313d96dde7948e7a3c05101870d8985e4fd8d26aedd27b89"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"encoding_rs",
|
||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ derive_more = { version = "2.0.1", features = ["error", "display"] }
|
|||||||
|
|
||||||
utoipa = { version = "5.4.0", features = ["macros", "chrono"] }
|
utoipa = { version = "5.4.0", features = ["macros", "chrono"] }
|
||||||
|
|
||||||
calamine = "0.30"
|
calamine = "0.31"
|
||||||
async-trait = "0.1.89"
|
async-trait = "0.1.89"
|
||||||
|
|
||||||
reqwest = "0.12.23"
|
reqwest = "0.12.23"
|
||||||
@@ -29,4 +29,5 @@ regex = "1.11.2"
|
|||||||
strsim = "0.11.1"
|
strsim = "0.11.1"
|
||||||
log = "0.4.27"
|
log = "0.4.27"
|
||||||
sentry = "0.43.0"
|
sentry = "0.43.0"
|
||||||
|
fancy-regex = "0.16.2"
|
||||||
|
|
||||||
|
|||||||
@@ -65,7 +65,11 @@ impl ScheduleProvider for Wrapper {
|
|||||||
this.snapshot = Arc::new(snapshot);
|
this.snapshot = Arc::new(snapshot);
|
||||||
},
|
},
|
||||||
|
|
||||||
|
Err(updater::error::Error::QueryUrlFailed(updater::error::QueryUrlError::UriFetchFailed)) => {},
|
||||||
|
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
|
sentry::capture_error(&err);
|
||||||
|
|
||||||
cancellation_token.cancel();
|
cancellation_token.cancel();
|
||||||
return Err(err.into());
|
return Err(err.into());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -233,6 +233,7 @@ enum LessonParseResult {
|
|||||||
fn guess_lesson_type(text: &str) -> Option<LessonType> {
|
fn guess_lesson_type(text: &str) -> Option<LessonType> {
|
||||||
static MAP: LazyLock<HashMap<&str, LessonType>> = LazyLock::new(|| {
|
static MAP: LazyLock<HashMap<&str, LessonType>> = LazyLock::new(|| {
|
||||||
HashMap::from([
|
HashMap::from([
|
||||||
|
("о важном", LessonType::Additional),
|
||||||
("консультация", LessonType::Consultation),
|
("консультация", LessonType::Consultation),
|
||||||
("самостоятельная работа", LessonType::IndependentWork),
|
("самостоятельная работа", LessonType::IndependentWork),
|
||||||
("зачет", LessonType::Exam),
|
("зачет", LessonType::Exam),
|
||||||
@@ -427,127 +428,128 @@ fn parse_name_and_subgroups(text: &str) -> Result<ParsedLessonName, Error> {
|
|||||||
// 3. "Модификатор" (чаще всего).
|
// 3. "Модификатор" (чаще всего).
|
||||||
//
|
//
|
||||||
// Регулярное выражение для получения ФИО преподавателей и номеров подгрупп (aka. второй части).
|
// Регулярное выражение для получения ФИО преподавателей и номеров подгрупп (aka. второй части).
|
||||||
// (?:[А-Я][а-я]+\s?(?:[А-Я][\s.]*){2}(?:\(\d\s?[а-я]+\))?(?:, )?)+[\s.]*
|
static NAME_RE: LazyLock<fancy_regex::Regex> = LazyLock::new(|| {
|
||||||
//
|
fancy_regex::Regex::new(
|
||||||
// Подробнее:
|
r"([А-Я][а-я]+(?:[\s.]*[А-Я]){1,2})(?=[^а-я])[.\s]*(?:\(?(\d)[\sа-я]*\)?)?",
|
||||||
// (?:
|
|
||||||
// [А-Я][а-я]+ - Фамилия.
|
|
||||||
// \s? - Кто знает, будет ли там пробел.
|
|
||||||
// (?:[А-Я][\s.]*){2} - Имя и отчество с учётом случайных пробелов и точек.
|
|
||||||
// (?:
|
|
||||||
// \( - Открытие подгруппы.
|
|
||||||
// \s? - Кто знает, будет ли там пробел.
|
|
||||||
// \d - Номер подгруппы.
|
|
||||||
// \s? - Кто знает, будет ли там пробел.
|
|
||||||
// [а-я\s]+ - Слово "подгруппа" с учётов ошибок.
|
|
||||||
// \) - Закрытие подгруппы.
|
|
||||||
// )? - Явное указание подгруппы может отсутствовать по понятным причинам.
|
|
||||||
// (?:, )? - Разделители между отдельными частями.
|
|
||||||
// )+
|
|
||||||
// [\s.]* - Забираем с собой всякий мусор, что бы не передать его в третью часть.
|
|
||||||
|
|
||||||
static NAMES_REGEX: LazyLock<Regex> = LazyLock::new(|| {
|
|
||||||
Regex::new(
|
|
||||||
r"(?:[А-Я][а-я]+\s?(?:[А-Я][\s.]*){2}(?:\(?\s*\d\s*[а-я\s]+\)?)?(?:[\s,.]+)?){1,2}+[\s.,]*",
|
|
||||||
)
|
)
|
||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
// Отчистка
|
let text = text
|
||||||
static CLEAN_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\s\n\t]+").unwrap());
|
.chars()
|
||||||
|
.filter(|c: &char| {
|
||||||
|
c.is_whitespace()
|
||||||
|
|| c.is_ascii_digit()
|
||||||
|
|| (*c >= 'а' && *c <= 'я')
|
||||||
|
|| (*c >= 'А' && *c <= 'Я')
|
||||||
|
|| *c == '.'
|
||||||
|
|| *c == '-'
|
||||||
|
})
|
||||||
|
.collect::<String>()
|
||||||
|
.replace(r"\s+", " ");
|
||||||
|
|
||||||
let text = CLEAN_RE
|
let mut lesson_name: Option<&str> = None;
|
||||||
.replace(&text.replace([' ', '\t', '\n'], " ").replace(",", ""), " ")
|
let mut extra: Option<&str> = None;
|
||||||
.to_string();
|
|
||||||
|
|
||||||
let (lesson_name, subgroups, lesson_type) = match NAMES_REGEX.captures(&text) {
|
let mut shared_subgroup = false;
|
||||||
Some(captures) => {
|
let mut subgroups: [Option<LessonSubGroup>; 2] = [None, None];
|
||||||
let capture = captures.get(0).unwrap();
|
|
||||||
|
|
||||||
let subgroups: Vec<Option<LessonSubGroup>> = {
|
for capture in NAME_RE.captures_iter(&text) {
|
||||||
let src = capture.as_str().replace([' ', '.'], "");
|
let capture = capture.unwrap();
|
||||||
|
|
||||||
let mut shared_subgroup = false;
|
if lesson_name.is_none() {
|
||||||
let mut subgroups: [Option<LessonSubGroup>; 2] = [None, None];
|
lesson_name = Some(&text[..capture.get(0).unwrap().start()]);
|
||||||
|
|
||||||
for name in src.split(',') {
|
|
||||||
let digit_index = name.find(|c: char| c.is_ascii_digit());
|
|
||||||
|
|
||||||
let number: u8 =
|
|
||||||
digit_index.map_or(0, |index| name[(index)..(index + 1)].parse().unwrap());
|
|
||||||
|
|
||||||
let teacher_name = {
|
|
||||||
let name_end = name
|
|
||||||
.find(|c: char| !c.is_alphabetic())
|
|
||||||
.unwrap_or(name.len());
|
|
||||||
|
|
||||||
// Я ебал. Как же я долго до этого доходил.
|
|
||||||
format!(
|
|
||||||
"{} {}.{}.",
|
|
||||||
name.get(..name_end - 4).unwrap(),
|
|
||||||
name.get(name_end - 4..name_end - 2).unwrap(),
|
|
||||||
name.get(name_end - 2..name_end).unwrap(),
|
|
||||||
)
|
|
||||||
};
|
|
||||||
|
|
||||||
let lesson = Some(LessonSubGroup {
|
|
||||||
cabinet: None,
|
|
||||||
teacher: Some(teacher_name),
|
|
||||||
});
|
|
||||||
|
|
||||||
match number {
|
|
||||||
0 => {
|
|
||||||
subgroups[0] = lesson;
|
|
||||||
subgroups[1] = None;
|
|
||||||
shared_subgroup = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
num => {
|
|
||||||
// 1 - 1 = 0 | 2 - 1 = 1 | 3 - 1 = 2 (schedule index to array index)
|
|
||||||
// 0 % 2 = 0 | 1 % 2 = 1 | 2 % 2 = 0 (clamp)
|
|
||||||
let normalised = (num - 1) % 2;
|
|
||||||
|
|
||||||
subgroups[normalised as usize] = lesson;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if shared_subgroup {
|
|
||||||
Vec::from([subgroups[0].take()])
|
|
||||||
} else {
|
|
||||||
Vec::from(subgroups)
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let name = text[..capture.start()].trim().to_string();
|
|
||||||
let extra = text[capture.end()..].trim().to_string();
|
|
||||||
|
|
||||||
let lesson_type = if extra.len() > 4 {
|
|
||||||
let result = guess_lesson_type(&extra);
|
|
||||||
|
|
||||||
if result.is_none() {
|
|
||||||
#[cfg(not(debug_assertions))]
|
|
||||||
sentry::capture_message(
|
|
||||||
&format!("Не удалось угадать тип пары '{}'!", extra),
|
|
||||||
sentry::Level::Warning,
|
|
||||||
);
|
|
||||||
|
|
||||||
#[cfg(debug_assertions)]
|
|
||||||
log::warn!("Не удалось угадать тип пары '{}'!", extra);
|
|
||||||
}
|
|
||||||
|
|
||||||
result
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
(name, subgroups, lesson_type)
|
|
||||||
}
|
}
|
||||||
None => (text, Vec::new(), None),
|
|
||||||
|
extra = Some(&text[capture.get(0).unwrap().end()..]);
|
||||||
|
|
||||||
|
let teacher_name = {
|
||||||
|
let clean = capture
|
||||||
|
.get(1)
|
||||||
|
.unwrap()
|
||||||
|
.as_str()
|
||||||
|
.chars()
|
||||||
|
.filter(|c| c.is_alphabetic())
|
||||||
|
.collect::<Vec<char>>();
|
||||||
|
|
||||||
|
if clean.get(clean.len() - 2).is_some_and(|c| c.is_uppercase()) {
|
||||||
|
let (name, remaining) = clean.split_at(clean.len() - 2);
|
||||||
|
format!(
|
||||||
|
"{} {}.{}.",
|
||||||
|
name.iter().collect::<String>(),
|
||||||
|
remaining[0],
|
||||||
|
remaining[1]
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
let (remaining, name) = clean.split_last().unwrap();
|
||||||
|
format!("{} {}.", name.iter().collect::<String>(), remaining)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let subgroup_index = capture
|
||||||
|
.get(2)
|
||||||
|
.and_then(|m| Some(m.as_str().parse::<u32>().unwrap()));
|
||||||
|
|
||||||
|
let subgroup = Some(LessonSubGroup {
|
||||||
|
cabinet: None,
|
||||||
|
teacher: Some(teacher_name),
|
||||||
|
});
|
||||||
|
|
||||||
|
match subgroup_index {
|
||||||
|
None => {
|
||||||
|
subgroups[0] = subgroup;
|
||||||
|
subgroups[1] = None;
|
||||||
|
shared_subgroup = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Some(num) => {
|
||||||
|
// 1 - 1 = 0 | 2 - 1 = 1 | 3 - 1 = 2 (schedule index to array index)
|
||||||
|
// 0 % 2 = 0 | 1 % 2 = 1 | 2 % 2 = 0 (clamp)
|
||||||
|
let normalised = (num - 1) % 2;
|
||||||
|
|
||||||
|
subgroups[normalised as usize] = subgroup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let subgroups = if lesson_name.is_none() {
|
||||||
|
Vec::new()
|
||||||
|
} else if shared_subgroup {
|
||||||
|
Vec::from([subgroups[0].take()])
|
||||||
|
} else {
|
||||||
|
Vec::from(subgroups)
|
||||||
|
};
|
||||||
|
|
||||||
|
if extra.is_none() {
|
||||||
|
extra = text
|
||||||
|
.rfind(" ")
|
||||||
|
.and_then(|i| text[..i].rfind(" "))
|
||||||
|
.map(|i| &text[i + 1..]);
|
||||||
|
}
|
||||||
|
|
||||||
|
let lesson_type = if let Some(extra) = extra
|
||||||
|
&& extra.len() > 4
|
||||||
|
{
|
||||||
|
let result = guess_lesson_type(&extra);
|
||||||
|
|
||||||
|
if result.is_none() {
|
||||||
|
#[cfg(not(debug_assertions))]
|
||||||
|
sentry::capture_message(
|
||||||
|
&format!("Не удалось угадать тип пары '{}'!", extra),
|
||||||
|
sentry::Level::Warning,
|
||||||
|
);
|
||||||
|
|
||||||
|
#[cfg(debug_assertions)]
|
||||||
|
log::warn!("Не удалось угадать тип пары '{}'!", extra);
|
||||||
|
}
|
||||||
|
|
||||||
|
result
|
||||||
|
} else {
|
||||||
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(ParsedLessonName {
|
Ok(ParsedLessonName {
|
||||||
name: lesson_name,
|
name: lesson_name.unwrap_or(&text).to_string(),
|
||||||
subgroups,
|
subgroups,
|
||||||
r#type: lesson_type,
|
r#type: lesson_type,
|
||||||
})
|
})
|
||||||
|
|||||||
Reference in New Issue
Block a user