From 665dcbe07da84b11e195d53687473b6a3ae993e3 Mon Sep 17 00:00:00 2001 From: AB Date: Mon, 28 Dec 2020 13:25:53 +0300 Subject: [PATCH] Impl Display for Grammems structs. Added readme. Reworked lexeme detection - now stores all the lexemes ordered by weight. --- Cargo.toml | 5 +-- README.md | 29 +++++++++++++++ examples/test.rs | 30 +++++++++++++-- src/grammems.rs | 95 +++++++++++++++++++++++++++--------------------- src/lib.rs | 39 +++++++++++++++----- 5 files changed, 140 insertions(+), 58 deletions(-) create mode 100644 README.md diff --git a/Cargo.toml b/Cargo.toml index 0cbee07..7c3d3f9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,18 +1,17 @@ [package] name = "mystem" -version = "0.1.0" +version = "0.2.0" authors = ["AB "] license = "WTFPL" edition = "2018" description = "Wrapper around Yandex Mystem for Rust." homepage = "https://github.com/house-of-vanity/mystem-rs" repository = "https://github.com/house-of-vanity/mystem-rs" +readme = "README.md" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] subprocess = "0.2.6" serde_json = "1.0" -env_logger = "0.7" log = { version = "^0.4.5", features = ["std"] } -failure = "0.1" \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..1888bc4 --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# MyStem Rust Wrapper + +Rust wrapper for the Yandex MyStem 3.1 morphological analyzer of the Russian language. + +#### System Requirements +The wrapper was tested on Ubuntu Linux 18.04+, Windows 10. +Mystem binary should be accessible via PATH so manual installation is required. [MyStem Web Site](https://yandex.ru/dev/mystem/) + +### A Quick Example + +```rust +let mut instance = mystem::MyStem::new()?; +for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? 
{ + println!( + "'{}' most likely is a '{}' and lexeme is '{}'.", + stem.text, + stem.lex[0].grammem.part_of_speech, + stem.lex[0].lex + ) +} + +//'Связался' most likely is a 'Verb' and lexeme is 'связываться'. +//'с' most likely is a 'Preposition' and lexeme is 'с'. +//'лучшим' most likely is a 'Adjective' and lexeme is 'хороший'. +//'подохни' most likely is a 'Verb' and lexeme is 'подыхать'. +//'как' most likely is a 'Conjunction' and lexeme is 'как'. +//'все' most likely is a 'AdjectivePronoun' and lexeme is 'весь'. + +``` \ No newline at end of file diff --git a/examples/test.rs b/examples/test.rs index dd27b9e..1b6ed1a 100644 --- a/examples/test.rs +++ b/examples/test.rs @@ -1,13 +1,37 @@ extern crate mystem; +use mystem::Other::Obscene; + #[allow(unused_must_use)] fn main() -> Result<(), mystem::AppError> { let mut instance = mystem::MyStem::new()?; - for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? + for stem in instance.stemming("Связался с лучшим - подохни как все, Говноед.".into())? 
{ - println!("{} is a lexeme of {}", stem.lex, stem.text) + println!( + "'{}' most likely is a '{}' and lexeme is '{}'.{}{}", + stem.text, + stem.lex[0].grammem.part_of_speech, + stem.lex[0].lex, + { + match stem.lex[0] + .grammem + .facts + .contains(&mystem::Fact::Other(Obscene)) + { + true => " Obscene lexis.", + false => "", + } + }, + { + match stem.lex.len() + { + 0|1 => "".to_string(), + x if x > 1 => format!(" Also has {} found lexems.", x), + _ => unreachable!() + } + } + ) } - instance.terminate(); Ok(()) } diff --git a/src/grammems.rs b/src/grammems.rs index 183931c..a07d88e 100644 --- a/src/grammems.rs +++ b/src/grammems.rs @@ -15,6 +15,7 @@ use crate::Tense::{Inpresent, Past, Present}; use crate::Transitivity::{Intransitive, Transitive}; use crate::VerbPerson::{First, Second, Third}; use crate::Voice::{Active, Passive}; +use std::fmt; use std::str::FromStr; #[derive(Debug)] @@ -31,58 +32,63 @@ pub struct Grammem { #[derive(Debug, PartialEq)] pub enum PartOfSpeech { /// прилагательное - A, + Adjective, /// наречие - ADV, + Adverb, /// местоименное наречие - ADVPRO, + AdverbPronominal, /// числительное-прилагательное - ANUM, + AdjectiveNumeral, /// местоимение-прилагательное - APRO, + AdjectivePronoun, /// часть композита - сложного слова - COM, + Composite, /// союз - CONJ, + Conjunction, /// междометие - INTJ, + Interjection, /// числительное - NUM, + Numeral, /// частица - PART, + Particle, /// предлог - PR, + Preposition, /// существительное - S, + Noun, /// местоимение-существительное - SPRO, + AdjectiveNoun, /// глагол - V, + Verb, } impl FromStr for PartOfSpeech { type Err = crate::AppError; fn from_str(input: &str) -> Result { match input { - "A" => Ok(PartOfSpeech::A), - "ADV" => Ok(PartOfSpeech::ADV), - "ADVPRO" => Ok(PartOfSpeech::ADVPRO), - "ANUM" => Ok(PartOfSpeech::ANUM), - "APRO" => Ok(PartOfSpeech::APRO), - "COM" => Ok(PartOfSpeech::COM), - "CONJ" => Ok(PartOfSpeech::CONJ), - "INTJ" => Ok(PartOfSpeech::INTJ), - "NUM" => 
Ok(PartOfSpeech::NUM), - "PART" => Ok(PartOfSpeech::PART), - "PR" => Ok(PartOfSpeech::PR), - "S" => Ok(PartOfSpeech::S), - "SPRO" => Ok(PartOfSpeech::SPRO), - "V" => Ok(PartOfSpeech::V), + "A" => Ok(PartOfSpeech::Adjective), + "ADV" => Ok(PartOfSpeech::Adverb), + "ADVPRO" => Ok(PartOfSpeech::AdverbPronominal), + "ANUM" => Ok(PartOfSpeech::AdjectiveNumeral), + "APRO" => Ok(PartOfSpeech::AdjectivePronoun), + "COM" => Ok(PartOfSpeech::Composite), + "CONJ" => Ok(PartOfSpeech::Conjunction), + "INTJ" => Ok(PartOfSpeech::Interjection), + "NUM" => Ok(PartOfSpeech::Numeral), + "PART" => Ok(PartOfSpeech::Particle), + "PR" => Ok(PartOfSpeech::Preposition), + "S" => Ok(PartOfSpeech::Noun), + "SPRO" => Ok(PartOfSpeech::AdjectiveNoun), + "V" => Ok(PartOfSpeech::Verb), _ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")), } } } +impl fmt::Display for PartOfSpeech { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Fact { Case(Case), Tense(Tense), @@ -98,8 +104,13 @@ pub enum Fact { Transitivity(Transitivity), Other(Other), } +impl fmt::Display for Fact { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Case { Nominative, //именительный Genitive, //родительный @@ -112,20 +123,20 @@ pub enum Case { Vocative, //звательный } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Tense { Present, //настоящее Inpresent, //непрошедшее Past, //прошедшее } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Plurality { Plural, //настоящее Singular, //непрошедшее } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Mood { Gerunds, //деепричастие Infinitive, //инфинитив @@ -134,58 +145,58 @@ pub enum Mood { Imperative, //повелительное наклонение } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Adjective { Short, //Краткое Long, //Полное Possessive, //притяжательное 
} -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum ComparativeDegree { Superlative, //превосходная Comparative, //сравнительная } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum VerbPerson { First, //1-е лицо Second, //2-е лицо Third, //3-е лицо } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Gender { Masculine, //мужской род Feminine, //женский род Neuter, //средний род } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum PerfectiveAspect { Perfective, //совершенный Imperfective, //несовершенный } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Voice { Passive, //страдательный залог Active, //действительный залог } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Animacy { Animate, //одушевленное Inanimate, //неодушевленное } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Transitivity { Transitive, //переходный глагол Intransitive, //непереходный глагол } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Other { Parenthesis, //вводное слово Geo, //географическое название diff --git a/src/lib.rs b/src/lib.rs index 253d80e..3b8b518 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,21 +12,30 @@ extern crate log; pub use error::*; pub use grammems::*; -/// A Mystem process represented here +/// A Mystem process representation #[derive(Debug)] pub struct MyStem { pub process: Popen, } -/// Stemmed result +/// Lexeme struct #[derive(Debug)] -pub struct Stemming { - /// Original word - pub text: String, +pub struct Lexeme { /// Detected lexeme pub lex: String, /// Detected grammems pub grammem: Grammem, + /// Weight of Lexeme + pub weight: f64, +} + +/// Stemmed result containing `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html) +#[derive(Debug)] +pub struct Stemming { + /// Original word + pub text: String, + /// `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html) of `text`. 
+ pub lex: Vec, } impl MyStem { @@ -41,7 +50,7 @@ impl MyStem { fn open_process() -> Result { Popen::create( - &["mystem", "-d", "-i", "--format", "json", "--eng-gr"], + &["mystem", "-i", "--format", "json", "--eng-gr", "--weight"], PopenConfig { stdout: Redirection::Pipe, stdin: Redirection::Pipe, @@ -119,10 +128,20 @@ impl MyStem { for i in v { stemmings.push(Stemming { text: i["text"].to_string().replace("\"", ""), - lex: i["analysis"][0]["lex"].to_string().replace("\"", ""), - grammem: self.detect_grammems( - i["analysis"][0]["gr"].to_string().replace("\"", ""), - )?, + lex: { + i["analysis"] + .as_array() + .unwrap() + .iter() + .map(|z| Lexeme { + lex: z["lex"].to_string().replace("\"", ""), + grammem: self + .detect_grammems(z["gr"].to_string().replace("\"", "")) + .unwrap(), + weight: z["wt"].as_f64().unwrap_or(1.0), + }) + .collect() + }, }); } Ok(stemmings)