Implement Display for the Grammems structs. Added README. Reworked lexeme detection - it now stores all lexemes ordered by weight.

This commit is contained in:
AB
2020-12-28 13:25:53 +03:00
parent 16118ae6db
commit 665dcbe07d
5 changed files with 140 additions and 58 deletions

View File

@@ -1,18 +1,17 @@
[package] [package]
name = "mystem" name = "mystem"
version = "0.1.0" version = "0.2.0"
authors = ["AB <ab@hexor.ru>"] authors = ["AB <ab@hexor.ru>"]
license = "WTFPL" license = "WTFPL"
edition = "2018" edition = "2018"
description = "Wrapper around Yandex Mystem for Rust." description = "Wrapper around Yandex Mystem for Rust."
homepage = "https://github.com/house-of-vanity/mystem-rs" homepage = "https://github.com/house-of-vanity/mystem-rs"
repository = "https://github.com/house-of-vanity/mystem-rs" repository = "https://github.com/house-of-vanity/mystem-rs"
readme = "README.md"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
subprocess = "0.2.6" subprocess = "0.2.6"
serde_json = "1.0" serde_json = "1.0"
env_logger = "0.7"
log = { version = "^0.4.5", features = ["std"] } log = { version = "^0.4.5", features = ["std"] }
failure = "0.1"

29
README.md Normal file
View File

@@ -0,0 +1,29 @@
# MyStem Rust Wrapper
Rust wrapper for the Yandex MyStem 3.1 morphological analyzer of the Russian language.
#### System Requirements
The wrapper was tested on Ubuntu Linux 18.04+ and Windows 10.
The MyStem binary must be accessible via PATH, so it has to be installed manually. [MyStem Web Site](https://yandex.ru/dev/mystem/)
### A Quick Example
```rust
let mut instance = mystem::MyStem::new()?;
for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
println!(
"'{}' most likely is a '{}' and lexeme is '{}'.",
stem.text,
stem.lex[0].grammem.part_of_speech,
stem.lex[0].lex
)
}
//'Связался' most likely is a 'Verb' and lexeme is 'связываться'.
//'с' most likely is a 'Preposition' and lexeme is 'с'.
//'лучшим' most likely is a 'Adjective' and lexeme is 'хороший'.
//'подохни' most likely is a 'Verb' and lexeme is 'подыхать'.
//'как' most likely is a 'Conjunction' and lexeme is 'как'.
//'все' most likely is a 'AdjectivePronoun' and lexeme is 'весь'.
```

View File

@@ -1,13 +1,37 @@
extern crate mystem; extern crate mystem;
use mystem::Other::Obscene;
#[allow(unused_must_use)] #[allow(unused_must_use)]
fn main() -> Result<(), mystem::AppError> { fn main() -> Result<(), mystem::AppError> {
let mut instance = mystem::MyStem::new()?; let mut instance = mystem::MyStem::new()?;
for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? for stem in instance.stemming("Связался с лучшим - подохни как все, Говноед.".into())?
{ {
println!("{} is a lexeme of {}", stem.lex, stem.text) println!(
"'{}' most likely is a '{}' and lexeme is '{}'.{}{}",
stem.text,
stem.lex[0].grammem.part_of_speech,
stem.lex[0].lex,
{
match stem.lex[0]
.grammem
.facts
.contains(&mystem::Fact::Other(Obscene))
{
true => " Obscene lexis.",
false => "",
}
},
{
match stem.lex.len()
{
0|1 => "".to_string(),
x if x > 1 => format!(" Also has {} found lexems.", x),
_ => unreachable!()
}
}
)
} }
instance.terminate(); instance.terminate();
Ok(()) Ok(())
} }

View File

@@ -15,6 +15,7 @@ use crate::Tense::{Inpresent, Past, Present};
use crate::Transitivity::{Intransitive, Transitive}; use crate::Transitivity::{Intransitive, Transitive};
use crate::VerbPerson::{First, Second, Third}; use crate::VerbPerson::{First, Second, Third};
use crate::Voice::{Active, Passive}; use crate::Voice::{Active, Passive};
use std::fmt;
use std::str::FromStr; use std::str::FromStr;
#[derive(Debug)] #[derive(Debug)]
@@ -31,58 +32,63 @@ pub struct Grammem {
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub enum PartOfSpeech { pub enum PartOfSpeech {
/// прилагательное /// прилагательное
A, Adjective,
/// наречие /// наречие
ADV, Adverb,
/// местоименное наречие /// местоименное наречие
ADVPRO, AdverbPronominal,
/// числительное-прилагательное /// числительное-прилагательное
ANUM, AdjectiveNumeral,
/// местоимение-прилагательное /// местоимение-прилагательное
APRO, AdjectivePronoun,
/// часть композита - сложного слова /// часть композита - сложного слова
COM, Composite,
/// союз /// союз
CONJ, Conjunction,
/// междометие /// междометие
INTJ, Interjection,
/// числительное /// числительное
NUM, Numeral,
/// частица /// частица
PART, Particle,
/// предлог /// предлог
PR, Preposition,
/// существительное /// существительное
S, Noun,
/// местоимение-существительное /// местоимение-существительное
SPRO, AdjectiveNoun,
/// глагол /// глагол
V, Verb,
} }
impl FromStr for PartOfSpeech { impl FromStr for PartOfSpeech {
type Err = crate::AppError; type Err = crate::AppError;
fn from_str(input: &str) -> Result<PartOfSpeech, Self::Err> { fn from_str(input: &str) -> Result<PartOfSpeech, Self::Err> {
match input { match input {
"A" => Ok(PartOfSpeech::A), "A" => Ok(PartOfSpeech::Adjective),
"ADV" => Ok(PartOfSpeech::ADV), "ADV" => Ok(PartOfSpeech::Adverb),
"ADVPRO" => Ok(PartOfSpeech::ADVPRO), "ADVPRO" => Ok(PartOfSpeech::AdverbPronominal),
"ANUM" => Ok(PartOfSpeech::ANUM), "ANUM" => Ok(PartOfSpeech::AdjectiveNumeral),
"APRO" => Ok(PartOfSpeech::APRO), "APRO" => Ok(PartOfSpeech::AdjectivePronoun),
"COM" => Ok(PartOfSpeech::COM), "COM" => Ok(PartOfSpeech::Composite),
"CONJ" => Ok(PartOfSpeech::CONJ), "CONJ" => Ok(PartOfSpeech::Conjunction),
"INTJ" => Ok(PartOfSpeech::INTJ), "INTJ" => Ok(PartOfSpeech::Interjection),
"NUM" => Ok(PartOfSpeech::NUM), "NUM" => Ok(PartOfSpeech::Numeral),
"PART" => Ok(PartOfSpeech::PART), "PART" => Ok(PartOfSpeech::Particle),
"PR" => Ok(PartOfSpeech::PR), "PR" => Ok(PartOfSpeech::Preposition),
"S" => Ok(PartOfSpeech::S), "S" => Ok(PartOfSpeech::Noun),
"SPRO" => Ok(PartOfSpeech::SPRO), "SPRO" => Ok(PartOfSpeech::AdjectiveNoun),
"V" => Ok(PartOfSpeech::V), "V" => Ok(PartOfSpeech::Verb),
_ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")), _ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")),
} }
} }
} }
/// User-facing formatting for [`PartOfSpeech`].
///
/// The `Display` text is the same as the derived `Debug` text, i.e. the bare
/// variant name (for example `Verb` or `AdjectivePronoun`).
impl fmt::Display for PartOfSpeech {
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        // The Debug representation is rendered through a fresh set of format
        // arguments, so width/alignment/alternate flags given by the caller
        // are not forwarded to it.
        formatter.write_fmt(format_args!("{:?}", self))
    }
}
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Fact { pub enum Fact {
Case(Case), Case(Case),
Tense(Tense), Tense(Tense),
@@ -98,8 +104,13 @@ pub enum Fact {
Transitivity(Transitivity), Transitivity(Transitivity),
Other(Other), Other(Other),
} }
/// User-facing formatting for [`Fact`].
///
/// The `Display` text is the same as the derived `Debug` text: the variant
/// name followed by the wrapped value in parentheses, e.g. `Other(Obscene)`.
impl fmt::Display for Fact {
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        // The Debug representation is rendered through a fresh set of format
        // arguments, so caller-supplied flags such as `{:#}` do not switch it
        // to the pretty-printed multi-line form.
        formatter.write_fmt(format_args!("{:?}", self))
    }
}
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Case { pub enum Case {
Nominative, //именительный Nominative, //именительный
Genitive, //родительный Genitive, //родительный
@@ -112,20 +123,20 @@ pub enum Case {
Vocative, //звательный Vocative, //звательный
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Tense { pub enum Tense {
Present, //настоящее Present, //настоящее
Inpresent, //непрошедшее Inpresent, //непрошедшее
Past, //прошедшее Past, //прошедшее
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Plurality { pub enum Plurality {
Plural, //множественное число Plural, //множественное число
Singular, //единственное число Singular, //единственное число
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Mood { pub enum Mood {
Gerunds, //деепричастие Gerunds, //деепричастие
Infinitive, //инфинитив Infinitive, //инфинитив
@@ -134,58 +145,58 @@ pub enum Mood {
Imperative, //повелительное наклонение Imperative, //повелительное наклонение
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Adjective { pub enum Adjective {
Short, //Краткое Short, //Краткое
Long, //Полное Long, //Полное
Possessive, //притяжательное Possessive, //притяжательное
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum ComparativeDegree { pub enum ComparativeDegree {
Superlative, //превосходная Superlative, //превосходная
Comparative, //сравнительная Comparative, //сравнительная
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum VerbPerson { pub enum VerbPerson {
First, //1-е лицо First, //1-е лицо
Second, //2-е лицо Second, //2-е лицо
Third, //3-е лицо Third, //3-е лицо
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Gender { pub enum Gender {
Masculine, //мужской род Masculine, //мужской род
Feminine, //женский род Feminine, //женский род
Neuter, //средний род Neuter, //средний род
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum PerfectiveAspect { pub enum PerfectiveAspect {
Perfective, //совершенный Perfective, //совершенный
Imperfective, //несовершенный Imperfective, //несовершенный
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Voice { pub enum Voice {
Passive, //страдательный залог Passive, //страдательный залог
Active, //действительный залог Active, //действительный залог
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Animacy { pub enum Animacy {
Animate, //одушевленное Animate, //одушевленное
Inanimate, //неодушевленное Inanimate, //неодушевленное
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Transitivity { pub enum Transitivity {
Transitive, //переходный глагол Transitive, //переходный глагол
Intransitive, //непереходный глагол Intransitive, //непереходный глагол
} }
#[derive(Debug)] #[derive(Debug, PartialEq)]
pub enum Other { pub enum Other {
Parenthesis, //вводное слово Parenthesis, //вводное слово
Geo, //географическое название Geo, //географическое название

View File

@@ -12,21 +12,30 @@ extern crate log;
pub use error::*; pub use error::*;
pub use grammems::*; pub use grammems::*;
/// A Mystem process represented here /// A Mystem process representation
#[derive(Debug)] #[derive(Debug)]
pub struct MyStem { pub struct MyStem {
pub process: Popen, pub process: Popen,
} }
/// Stemmed result /// Lexeme struct
#[derive(Debug)] #[derive(Debug)]
pub struct Stemming { pub struct Lexeme {
/// Original word
pub text: String,
/// Detected lexeme /// Detected lexeme
pub lex: String, pub lex: String,
/// Detected grammems /// Detected grammems
pub grammem: Grammem, pub grammem: Grammem,
/// Weight of Lexeme
pub weight: f64,
}
/// Stemmed result containing `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html)
#[derive(Debug)]
pub struct Stemming {
/// Original word
pub text: String,
/// `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html) of `text`.
pub lex: Vec<Lexeme>,
} }
impl MyStem { impl MyStem {
@@ -41,7 +50,7 @@ impl MyStem {
fn open_process() -> Result<Popen, PopenError> { fn open_process() -> Result<Popen, PopenError> {
Popen::create( Popen::create(
&["mystem", "-d", "-i", "--format", "json", "--eng-gr"], &["mystem", "-i", "--format", "json", "--eng-gr", "--weight"],
PopenConfig { PopenConfig {
stdout: Redirection::Pipe, stdout: Redirection::Pipe,
stdin: Redirection::Pipe, stdin: Redirection::Pipe,
@@ -119,10 +128,20 @@ impl MyStem {
for i in v { for i in v {
stemmings.push(Stemming { stemmings.push(Stemming {
text: i["text"].to_string().replace("\"", ""), text: i["text"].to_string().replace("\"", ""),
lex: i["analysis"][0]["lex"].to_string().replace("\"", ""), lex: {
grammem: self.detect_grammems( i["analysis"]
i["analysis"][0]["gr"].to_string().replace("\"", ""), .as_array()
)?, .unwrap()
.iter()
.map(|z| Lexeme {
lex: z["lex"].to_string().replace("\"", ""),
grammem: self
.detect_grammems(z["gr"].to_string().replace("\"", ""))
.unwrap(),
weight: z["wt"].as_f64().unwrap_or(1.0),
})
.collect()
},
}); });
} }
Ok(stemmings) Ok(stemmings)