This commit is contained in:
AB
2020-12-27 23:04:09 +03:00
commit e421d6f382
6 changed files with 451 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/target
Cargo.lock

17
Cargo.toml Normal file
View File

@ -0,0 +1,17 @@
[package]
name = "mystem"
version = "0.1.0"
authors = ["AB <ab@hexor.ru>"]
license = "WTFPL"
edition = "2018"
description = "Wrapper around Yandex Mystem for Rust."
homepage = ""
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
subprocess = "0.2.6"
serde_json = "1.0"
env_logger = "0.7"
log = { version = "^0.4.5", features = ["std"] }
failure = "0.1"

13
examples/test.rs Normal file
View File

@ -0,0 +1,13 @@
extern crate mystem;
/// Example: stem a Russian sentence and print the lexeme detected for every word.
fn main() -> Result<(), mystem::AppError> {
    let mut mystem_process = mystem::MyStem::new()?;
    let stems = mystem_process.stemming(String::from("Связался с лучшим - подохни как все."))?;
    for word in stems {
        println!("{} is a lexeme of {}", word.lex, word.text)
    }
    // Shutdown is best-effort in this example: a termination failure is deliberately discarded.
    let _ = mystem_process.terminate();
    Ok(())
}

20
src/error.rs Normal file
View File

@ -0,0 +1,20 @@
use std::fmt;
use subprocess::PopenError;
/// Errors returned by this crate.
#[derive(Debug)]
pub enum AppError {
    /// A part-of-speech tag could not be recognised; carries a static description.
    PartOfSpeechError(&'static str),
    /// A grammem tag could not be recognised; carries a static description.
    GrammemError(&'static str),
    /// Wraps an error reported by the `subprocess` crate.
    PopenError(PopenError),
    /// Carries a static description; presumably meant for failures of the
    /// mystem process itself (NOTE(review): confirm intended use).
    MystemError(&'static str),
}
impl fmt::Display for AppError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "An error occurred.")
}
}
impl From<PopenError> for AppError {
fn from(e: PopenError) -> AppError {
return AppError::PopenError(e);
}
}

266
src/grammems.rs Normal file
View File

@ -0,0 +1,266 @@
use crate::error::*;
use crate::Adjective::{Long, Possessive, Short};
use crate::Animacy::{Animate, Inanimate};
use crate::Case::*;
use crate::ComparativeDegree::{Comparative, Superlative};
use crate::Gender::{Feminine, Masculine, Neuter};
use crate::Mood::{Gerunds, Imperative, Indicative, Infinitive, Participle};
use crate::Other::{
Abbreviation, Awkward, CommonForm, Distorted, FamilyName, Geo, Informal, Obscene, Obsolete,
Parenthesis, Patronymic, Predicative, ProperNoun, Rare,
};
use crate::PerfectiveAspect::{Imperfective, Perfective};
use crate::Plurality::{Plural, Singular};
use crate::Tense::{Inpresent, Past, Present};
use crate::Transitivity::{Intransitive, Transitive};
use crate::VerbPerson::{First, Second, Third};
use crate::Voice::{Active, Passive};
use std::str::FromStr;
/// Grammatical description attached to a [`Stemming`](./struct.Stemming.html).
#[derive(Debug)]
pub struct Grammem {
    /// Part of speech of the [`Stemming`](./struct.Stemming.html).
    pub part_of_speech: PartOfSpeech,
    /// Grammems parsed into a `Vec` of [`Fact`](./enum.Fact.html) values.
    pub facts: Vec<Fact>,
    /// The grammems as the unparsed strings received from mystem.
    pub facts_raw: Vec<String>,
}
/// Part-of-speech tags; variant names match the tags parsed by `FromStr`.
#[derive(Debug, PartialEq)]
pub enum PartOfSpeech {
    /// Adjective.
    A,
    /// Adverb.
    ADV,
    /// Pronominal adverb.
    ADVPRO,
    /// Numeral adjective.
    ANUM,
    /// Pronominal adjective.
    APRO,
    /// Part of a composite (compound) word.
    COM,
    /// Conjunction.
    CONJ,
    /// Interjection.
    INTJ,
    /// Numeral.
    NUM,
    /// Particle.
    PART,
    /// Preposition.
    PR,
    /// Noun.
    S,
    /// Pronominal noun.
    SPRO,
    /// Verb.
    V,
}
/// Parses a part-of-speech tag such as `"S"` or `"ADVPRO"`.
///
/// The match is exact and case-sensitive; any other input yields
/// `AppError::PartOfSpeechError`.
impl FromStr for PartOfSpeech {
    type Err = crate::AppError;

    fn from_str(input: &str) -> Result<PartOfSpeech, Self::Err> {
        let part_of_speech = match input {
            "A" => PartOfSpeech::A,
            "ADV" => PartOfSpeech::ADV,
            "ADVPRO" => PartOfSpeech::ADVPRO,
            "ANUM" => PartOfSpeech::ANUM,
            "APRO" => PartOfSpeech::APRO,
            "COM" => PartOfSpeech::COM,
            "CONJ" => PartOfSpeech::CONJ,
            "INTJ" => PartOfSpeech::INTJ,
            "NUM" => PartOfSpeech::NUM,
            "PART" => PartOfSpeech::PART,
            "PR" => PartOfSpeech::PR,
            "S" => PartOfSpeech::S,
            "SPRO" => PartOfSpeech::SPRO,
            "V" => PartOfSpeech::V,
            _ => return Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")),
        };
        Ok(part_of_speech)
    }
}
/// A single parsed grammem, grouped by grammatical category.
#[derive(Debug)]
pub enum Fact {
    /// Grammatical case.
    Case(Case),
    /// Verb tense.
    Tense(Tense),
    /// Grammatical number.
    Plurality(Plurality),
    /// Verb form or mood.
    Mood(Mood),
    /// Adjective form.
    Adjective(Adjective),
    /// Degree of comparison.
    ComparativeDegree(ComparativeDegree),
    /// Verb person.
    Person(VerbPerson),
    /// Grammatical gender.
    Gender(Gender),
    /// Verb aspect.
    PerfectiveAspect(PerfectiveAspect),
    /// Voice.
    Voice(Voice),
    /// Animacy.
    Animacy(Animacy),
    /// Verb transitivity.
    Transitivity(Transitivity),
    /// Remaining markers that belong to none of the categories above.
    Other(Other),
}
/// Grammatical case.
#[derive(Debug)]
pub enum Case {
    /// Nominative.
    Nominative,
    /// Genitive.
    Genitive,
    /// Dative.
    Dative,
    /// Accusative.
    Accusative,
    /// Instrumental.
    Instrumental,
    /// Prepositional.
    Prepositional,
    /// Partitive (second genitive).
    Partitive,
    /// Locative (second prepositional).
    Locative,
    /// Vocative.
    Vocative,
}
/// Verb tense.
#[derive(Debug)]
pub enum Tense {
    /// Present.
    Present,
    /// Non-past.
    Inpresent,
    /// Past.
    Past,
}
/// Grammatical number.
#[derive(Debug)]
pub enum Plurality {
    /// Plural number.
    Plural,
    /// Singular number.
    Singular,
}
/// Verb form or mood.
#[derive(Debug)]
pub enum Mood {
    /// Adverbial participle (gerund).
    Gerunds,
    /// Infinitive.
    Infinitive,
    /// Participle.
    Participle,
    /// Indicative mood.
    Indicative,
    /// Imperative mood.
    Imperative,
}
/// Adjective form.
#[derive(Debug)]
pub enum Adjective {
    /// Short form.
    Short,
    /// Full (long) form.
    Long,
    /// Possessive.
    Possessive,
}
/// Degree of comparison.
#[derive(Debug)]
pub enum ComparativeDegree {
    /// Superlative degree.
    Superlative,
    /// Comparative degree.
    Comparative,
}
/// Verb person.
#[derive(Debug)]
pub enum VerbPerson {
    /// First person.
    First,
    /// Second person.
    Second,
    /// Third person.
    Third,
}
/// Grammatical gender.
#[derive(Debug)]
pub enum Gender {
    /// Masculine gender.
    Masculine,
    /// Feminine gender.
    Feminine,
    /// Neuter gender.
    Neuter,
}
/// Verb aspect.
#[derive(Debug)]
pub enum PerfectiveAspect {
    /// Perfective aspect.
    Perfective,
    /// Imperfective aspect.
    Imperfective,
}
/// Voice.
#[derive(Debug)]
pub enum Voice {
    /// Passive voice.
    Passive,
    /// Active voice.
    Active,
}
/// Animacy.
#[derive(Debug)]
pub enum Animacy {
    /// Animate.
    Animate,
    /// Inanimate.
    Inanimate,
}
/// Verb transitivity.
#[derive(Debug)]
pub enum Transitivity {
    /// Transitive verb.
    Transitive,
    /// Intransitive verb.
    Intransitive,
}
/// Markers that do not belong to any of the other grammatical categories.
#[derive(Debug)]
pub enum Other {
    /// Parenthetical (introductory) word.
    Parenthesis,
    /// Geographical name.
    Geo,
    /// The form is difficult to derive.
    Awkward,
    /// Proper noun.
    ProperNoun,
    /// Distorted form.
    Distorted,
    /// Common form for masculine and feminine gender.
    CommonForm,
    /// Obscene vocabulary.
    Obscene,
    /// Patronymic.
    Patronymic,
    /// Predicative.
    Predicative,
    /// Informal (colloquial) form.
    Informal,
    /// Rarely used word.
    Rare,
    /// Abbreviation.
    Abbreviation,
    /// Obsolete form.
    Obsolete,
    /// Family name.
    FamilyName,
}
/// Parses a single grammem tag (for example `"nom"` or `"praet"`) into a `Fact`.
///
/// The match is exact and case-sensitive; any tag that is not listed yields
/// `AppError::GrammemError`.
impl FromStr for Fact {
    type Err = crate::AppError;

    fn from_str(input: &str) -> Result<Fact, Self::Err> {
        let fact = match input {
            // Case
            "nom" => Fact::Case(Nominative),
            "gen" => Fact::Case(Genitive),
            "dat" => Fact::Case(Dative),
            "acc" => Fact::Case(Accusative),
            "ins" => Fact::Case(Instrumental),
            "abl" => Fact::Case(Prepositional),
            "part" => Fact::Case(Partitive),
            "loc" => Fact::Case(Locative),
            "voc" => Fact::Case(Vocative),
            // Tense
            "praes" => Fact::Tense(Present),
            "inpraes" => Fact::Tense(Inpresent),
            "praet" => Fact::Tense(Past),
            // Number
            "sg" => Fact::Plurality(Singular),
            "pl" => Fact::Plurality(Plural),
            // Verb form / mood
            "ger" => Fact::Mood(Gerunds),
            "inf" => Fact::Mood(Infinitive),
            "partcp" => Fact::Mood(Participle),
            "indic" => Fact::Mood(Indicative),
            "imper" => Fact::Mood(Imperative),
            // Adjective form
            "brev" => Fact::Adjective(Short),
            "plen" => Fact::Adjective(Long),
            "poss" => Fact::Adjective(Possessive),
            // Degree of comparison
            "supr" => Fact::ComparativeDegree(Superlative),
            "comp" => Fact::ComparativeDegree(Comparative),
            // Person
            "1p" => Fact::Person(First),
            "2p" => Fact::Person(Second),
            "3p" => Fact::Person(Third),
            // Gender
            "m" => Fact::Gender(Masculine),
            "f" => Fact::Gender(Feminine),
            "n" => Fact::Gender(Neuter),
            // Aspect
            "pf" => Fact::PerfectiveAspect(Perfective),
            "ipf" => Fact::PerfectiveAspect(Imperfective),
            // Voice
            "act" => Fact::Voice(Active),
            "pass" => Fact::Voice(Passive),
            // Animacy
            "anim" => Fact::Animacy(Animate),
            "inan" => Fact::Animacy(Inanimate),
            // Transitivity
            "tran" => Fact::Transitivity(Transitive),
            "intr" => Fact::Transitivity(Intransitive),
            // Other markers
            "parenth" => Fact::Other(Parenthesis),
            "geo" => Fact::Other(Geo),
            "awkw" => Fact::Other(Awkward),
            "persn" => Fact::Other(ProperNoun),
            "dist" => Fact::Other(Distorted),
            "mf" => Fact::Other(CommonForm),
            "obsc" => Fact::Other(Obscene),
            "patrn" => Fact::Other(Patronymic),
            "praed" => Fact::Other(Predicative),
            "inform" => Fact::Other(Informal),
            "rare" => Fact::Other(Rare),
            "abbr" => Fact::Other(Abbreviation),
            "obsol" => Fact::Other(Obsolete),
            "famn" => Fact::Other(FamilyName),
            _ => return Err(AppError::GrammemError("Failed to get Grammem.")),
        };
        Ok(fact)
    }
}

133
src/lib.rs Normal file
View File

@ -0,0 +1,133 @@
#![crate_name = "mystem"]
mod error;
mod grammems;
use serde_json::Value;
use std::io::{prelude::*, BufReader, Error, Write};
use std::str::FromStr;
use subprocess::{Popen, PopenConfig, PopenError, Redirection};
#[macro_use]
extern crate log;
use std::collections::{HashMap, HashSet};
pub use error::*;
pub use grammems::*;
/// Handle to a `mystem` child process.
#[derive(Debug)]
pub struct MyStem {
    /// The spawned process; set by `MyStem::new` and replaced by `stemming`
    /// when the previous process is found to have exited.
    pub process: Popen,
}
/// Stemming result for a single word.
#[derive(Debug)]
pub struct Stemming {
    /// The original word, taken from the `text` field of mystem's output.
    pub text: String,
    /// The detected lexeme, taken from the first analysis entry.
    pub lex: String,
    /// The detected grammems, parsed from the first analysis entry.
    pub grammem: Grammem,
}
impl MyStem {
/// Returns a MyStem instance with running process
/// of mystem binary. It keeps mystem running all the time
/// and reuse it.
pub fn new() -> Result<Self, AppError> {
let p = MyStem::open_process()?;
debug!("Mystem started with PID {}", p.pid().unwrap());
Ok(Self { process: p })
}
fn open_process() -> Result<Popen, PopenError> {
Popen::create(
&["mystem", "-d", "-i", "--format", "json", "--eng-gr"],
PopenConfig {
stdout: Redirection::Pipe,
stdin: Redirection::Pipe,
..Default::default()
},
)
}
/// Terminate mystem instance.
#[allow(dead_code)]
pub fn terminate(&mut self) -> Result<(), Error> {
self.process.terminate()
}
fn detect_grammems(&mut self, gr: String) -> Result<Grammem, AppError> {
let mut res: Vec<String> = gr
.split(|s| s == '=' || s == ',')
.map(|s| s.to_string())
.collect();
res.retain(|x| x != "");
Ok(Grammem {
part_of_speech: PartOfSpeech::from_str(res[0].as_str())?,
facts: res
.clone()
.split_off(1)
.iter_mut()
.map(|f| Fact::from_str(f).unwrap())
.collect(),
facts_raw: res.split_off(1),
})
}
/// Returns `Vec` with [`mystem::Stemming`](./struct.Stemming.html) for each word in `text`
/// # Examples
///
/// ```rust
/// let mut instance = mystem::MyStem::new()?;
/// for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
/// println!("{} is a lexeme of {}", stem.lex, stem.text)
/// }
/// // связываться is a lexeme of Связался
/// // с is a lexeme of с
/// // хороший is a lexeme of лучшим
/// // подыхать is a lexeme of подохни
/// // как is a lexeme of как
/// // все is a lexeme of все
/// ```
pub fn stemming(&mut self, text: String) -> Result<Vec<Stemming>, AppError> {
if let Some(exit_status) = self.process.poll() {
warn!(
"MyStem process ({:?}) exited with: {:?}. Restarting...",
self.process.pid().unwrap(),
exit_status
);
self.process = MyStem::open_process()?;
}
let clean_text = format!("{}{}", text.trim(), "\n");
self.process
.stdin
.as_ref()
.unwrap()
.write(clean_text.as_bytes());
let mut contents = String::new();
let mut buf_reader = BufReader::new(self.process.stdout.as_ref().unwrap());
buf_reader.read_line(&mut contents);
let mut stemmings: Vec<Stemming> = Vec::new();
match Some(contents) {
Some(contents) => {
let v: Vec<Value> = match serde_json::from_str(contents.as_str()) {
Ok(val) => val,
Err(_) => return Ok(vec![]),
};
for i in v {
stemmings.push(Stemming {
text: i["text"].to_string().replace("\"", ""),
lex: i["analysis"][0]["lex"].to_string().replace("\"", ""),
grammem: self.detect_grammems(
i["analysis"][0]["gr"].to_string().replace("\"", ""),
)?,
});
}
Ok(stemmings)
}
None => return Ok(vec![]),
}
}
}