Impl Display for Grammems structs. Added readme. Reworked lexeme detection - now stores all the lexemes ordered by weight.

2025-07-06 13:14:07 +00:00 · 2020-12-28 13:25:53 +03:00
parent 16118ae6db
commit 665dcbe07d
5 changed files with 140 additions and 58 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,18 +1,17 @@
 [package]
 name = "mystem"
-version = "0.1.0"
+version = "0.2.0"
 authors = ["AB <ab@hexor.ru>"]
 license = "WTFPL"
 edition = "2018"
 description = "Wrapper around Yandex Mystem for Rust."
 homepage = "https://github.com/house-of-vanity/mystem-rs"
 repository = "https://github.com/house-of-vanity/mystem-rs"
+readme = "README.md"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
 subprocess = "0.2.6"
 serde_json = "1.0"
-env_logger = "0.7"
 log = { version = "^0.4.5", features = ["std"] }
-failure = "0.1"
--- a/README.md
+++ b/README.md
@ -0,0 +1,29 @@
+# MyStem Rust Wrapper
+
+Rust wrapper for the Yandex MyStem 3.1 morphological analyzer of the Russian language.
+
+#### System Requirements
+The wrapper was tested on Ubuntu Linux 18.04+ and Windows 10.
+The MyStem binary must be accessible via PATH, so manual installation is required. [MyStem Web Site](https://yandex.ru/dev/mystem/)
+
+### A Quick Example
+
+```rust
+let mut instance = mystem::MyStem::new()?;
+for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
+    println!(
+        "'{}' most likely is a '{}' and lexeme is '{}'.",
+        stem.text,
+        stem.lex[0].grammem.part_of_speech,
+        stem.lex[0].lex
+    )
+}
+
+//'Связался' most likely is a 'Verb' and lexeme is 'связываться'.
+//'с' most likely is a 'Preposition' and lexeme is 'с'.
+//'лучшим' most likely is a 'Adjective' and lexeme is 'хороший'.
+//'подохни' most likely is a 'Verb' and lexeme is 'подыхать'.
+//'как' most likely is a 'Conjunction' and lexeme is 'как'.
+//'все' most likely is a 'AdjectivePronoun' and lexeme is 'весь'.
+
+```
--- a/examples/test.rs
+++ b/examples/test.rs
@ -1,13 +1,37 @@
 extern crate mystem;

+use mystem::Other::Obscene;
+
 #[allow(unused_must_use)]
 fn main() -> Result<(), mystem::AppError> {
    let mut instance = mystem::MyStem::new()?;
-    for stem in instance.stemming("Связался с лучшим - подохни как все.".into())?
+    for stem in instance.stemming("Связался с лучшим - подохни как все, Говноед.".into())?
    {
-        println!("{} is a lexeme of {}", stem.lex, stem.text)
+        println!(
+            "'{}' most likely is a '{}' and lexeme is '{}'.{}{}",
+            stem.text,
+            stem.lex[0].grammem.part_of_speech,
+            stem.lex[0].lex,
+            {
+                match stem.lex[0]
+                    .grammem
+                    .facts
+                    .contains(&mystem::Fact::Other(Obscene))
+                {
+                    true => " Obscene lexis.",
+                    false => "",
+                }
+            },
+            {
+                match stem.lex.len()
+                {
+                    0|1 => "".to_string(),
+                    x if x > 1 => format!(" Also has {} found lexems.", x),
+                    _ => unreachable!()
+                }
+            }
+        )
    }
-
    instance.terminate();
    Ok(())
 }
--- a/src/grammems.rs
+++ b/src/grammems.rs
@ -15,6 +15,7 @@ use crate::Tense::{Inpresent, Past, Present};
 use crate::Transitivity::{Intransitive, Transitive};
 use crate::VerbPerson::{First, Second, Third};
 use crate::Voice::{Active, Passive};
+use std::fmt;
 use std::str::FromStr;

 #[derive(Debug)]
@ -31,58 +32,63 @@ pub struct Grammem {
 #[derive(Debug, PartialEq)]
 pub enum PartOfSpeech {
    /// прилагательное
-    A,
+    Adjective,
    /// наречие
-    ADV,
+    Adverb,
    /// местоименное наречие
-    ADVPRO,
+    AdverbPronominal,
    /// числительное-прилагательное
-    ANUM,
+    AdjectiveNumeral,
    /// местоимение-прилагательное
-    APRO,
+    AdjectivePronoun,
    /// часть композита - сложного слова
-    COM,
+    Composite,
    /// союз
-    CONJ,
+    Conjunction,
    /// междометие
-    INTJ,
+    Interjection,
    /// числительное
-    NUM,
+    Numeral,
    /// частица
-    PART,
+    Particle,
    /// предлог
-    PR,
+    Preposition,
    /// существительное
-    S,
+    Noun,
    /// местоимение-существительное
-    SPRO,
+    AdjectiveNoun,
    /// глагол
-    V,
+    Verb,
 }
 impl FromStr for PartOfSpeech {
    type Err = crate::AppError;
    fn from_str(input: &str) -> Result<PartOfSpeech, Self::Err> {
        match input {
-            "A" => Ok(PartOfSpeech::A),
-            "ADV" => Ok(PartOfSpeech::ADV),
-            "ADVPRO" => Ok(PartOfSpeech::ADVPRO),
-            "ANUM" => Ok(PartOfSpeech::ANUM),
-            "APRO" => Ok(PartOfSpeech::APRO),
-            "COM" => Ok(PartOfSpeech::COM),
-            "CONJ" => Ok(PartOfSpeech::CONJ),
-            "INTJ" => Ok(PartOfSpeech::INTJ),
-            "NUM" => Ok(PartOfSpeech::NUM),
-            "PART" => Ok(PartOfSpeech::PART),
-            "PR" => Ok(PartOfSpeech::PR),
-            "S" => Ok(PartOfSpeech::S),
-            "SPRO" => Ok(PartOfSpeech::SPRO),
-            "V" => Ok(PartOfSpeech::V),
+            "A" => Ok(PartOfSpeech::Adjective),
+            "ADV" => Ok(PartOfSpeech::Adverb),
+            "ADVPRO" => Ok(PartOfSpeech::AdverbPronominal),
+            "ANUM" => Ok(PartOfSpeech::AdjectiveNumeral),
+            "APRO" => Ok(PartOfSpeech::AdjectivePronoun),
+            "COM" => Ok(PartOfSpeech::Composite),
+            "CONJ" => Ok(PartOfSpeech::Conjunction),
+            "INTJ" => Ok(PartOfSpeech::Interjection),
+            "NUM" => Ok(PartOfSpeech::Numeral),
+            "PART" => Ok(PartOfSpeech::Particle),
+            "PR" => Ok(PartOfSpeech::Preposition),
+            "S" => Ok(PartOfSpeech::Noun),
+            "SPRO" => Ok(PartOfSpeech::AdjectiveNoun),
+            "V" => Ok(PartOfSpeech::Verb),
            _ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")),
        }
    }
 }
+impl fmt::Display for PartOfSpeech {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Fact {
    Case(Case),
    Tense(Tense),
@ -98,8 +104,13 @@ pub enum Fact {
    Transitivity(Transitivity),
    Other(Other),
 }
+impl fmt::Display for Fact {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Case {
    Nominative,    //именительный
    Genitive,      //родительный
@ -112,20 +123,20 @@ pub enum Case {
    Vocative,      //звательный
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Tense {
    Present,   //настоящее
    Inpresent, //непрошедшее
    Past,      //прошедшее
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Plurality {
    Plural,   //множественное число
    Singular, //единственное число
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Mood {
    Gerunds,    //деепричастие
    Infinitive, //инфинитив
@ -134,58 +145,58 @@ pub enum Mood {
    Imperative, //повелительное наклонение
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Adjective {
    Short,      //Краткое
    Long,       //Полное
    Possessive, //притяжательное
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum ComparativeDegree {
    Superlative, //превосходная
    Comparative, //сравнительная
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum VerbPerson {
    First,  //1-е лицо
    Second, //2-е лицо
    Third,  //3-е лицо
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Gender {
    Masculine, //мужской род
    Feminine,  //женский род
    Neuter,    //средний род
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum PerfectiveAspect {
    Perfective,   //совершенный
    Imperfective, //несовершенный
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Voice {
    Passive, //страдательный залог
    Active,  //действительный залог
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Animacy {
    Animate,   //одушевленное
    Inanimate, //неодушевленное
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Transitivity {
    Transitive,   //переходный глагол
    Intransitive, //непереходный глагол
 }

-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Other {
    Parenthesis,  //вводное слово
    Geo,          //географическое название
--- a/src/lib.rs
+++ b/src/lib.rs
@ -12,21 +12,30 @@ extern crate log;
 pub use error::*;
 pub use grammems::*;

-/// A Mystem process represented here
+/// A Mystem process representation
 #[derive(Debug)]
 pub struct MyStem {
    pub process: Popen,
 }

-/// Stemmed result
+/// Lexeme struct
 #[derive(Debug)]
-pub struct Stemming {
-    /// Original word
-    pub text: String,
+pub struct Lexeme {
    /// Detected lexeme
    pub lex: String,
    /// Detected grammems
    pub grammem: Grammem,
+    /// Weight of Lexeme
+    pub weight: f64,
+}
+
+/// Stemmed result containing `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html)
+#[derive(Debug)]
+pub struct Stemming {
+    /// Original word
+    pub text: String,
+    /// `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html) of `text`.
+    pub lex: Vec<Lexeme>,
 }

 impl MyStem {
@ -41,7 +50,7 @@ impl MyStem {

    fn open_process() -> Result<Popen, PopenError> {
        Popen::create(
-            &["mystem", "-d", "-i", "--format", "json", "--eng-gr"],
+            &["mystem", "-i", "--format", "json", "--eng-gr", "--weight"],
            PopenConfig {
                stdout: Redirection::Pipe,
                stdin: Redirection::Pipe,
@ -119,10 +128,20 @@ impl MyStem {
                for i in v {
                    stemmings.push(Stemming {
                        text: i["text"].to_string().replace("\"", ""),
-                        lex: i["analysis"][0]["lex"].to_string().replace("\"", ""),
-                        grammem: self.detect_grammems(
-                            i["analysis"][0]["gr"].to_string().replace("\"", ""),
-                        )?,
+                        lex: {
+                            i["analysis"]
+                                .as_array()
+                                .unwrap()
+                                .iter()
+                                .map(|z| Lexeme {
+                                    lex: z["lex"].to_string().replace("\"", ""),
+                                    grammem: self
+                                        .detect_grammems(z["gr"].to_string().replace("\"", ""))
+                                        .unwrap(),
+                                    weight: z["wt"].as_f64().unwrap_or(1.0),
+                                })
+                                .collect()
+                        },
                    });
                }
                Ok(stemmings)