Impl Display for Grammems structs. Added readme. Reworked lexeme detection - now stores all the lexemes ordered by weight.

This commit is contained in:
AB
2020-12-28 13:25:53 +03:00
parent 16118ae6db
commit 665dcbe07d
5 changed files with 140 additions and 58 deletions

View File

@ -1,18 +1,17 @@
[package]
name = "mystem"
version = "0.1.0"
version = "0.2.0"
authors = ["AB <ab@hexor.ru>"]
license = "WTFPL"
edition = "2018"
description = "Wrapper around Yandex Mystem for Rust."
homepage = "https://github.com/house-of-vanity/mystem-rs"
repository = "https://github.com/house-of-vanity/mystem-rs"
readme = "README.md"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
subprocess = "0.2.6"
serde_json = "1.0"
env_logger = "0.7"
log = { version = "^0.4.5", features = ["std"] }
failure = "0.1"

29
README.md Normal file
View File

@ -0,0 +1,29 @@
# MyStem Rust Wrapper
Rust wrapper for the Yandex MyStem 3.1 morphological analyzer of the Russian language.
#### System Requirements
The wrapper was tested on Ubuntu Linux 18.04+, Windows 10.
Mystem binary should be accessible via PATH so manual installation is required. [MyStem Web Site](https://yandex.ru/dev/mystem/)
### A Quick Example
```rust
let mut instance = mystem::MyStem::new()?;
for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
println!(
"'{}' most likely is a '{}' and lexeme is '{}'.",
stem.text,
stem.lex[0].grammem.part_of_speech,
stem.lex[0].lex
)
}
//'Связался' most likely is a 'Verb' and lexeme is 'связываться'.
//'с' most likely is a 'Preposition' and lexeme is 'с'.
//'лучшим' most likely is a 'Adjective' and lexeme is 'хороший'.
//'подохни' most likely is a 'Verb' and lexeme is 'подыхать'.
//'как' most likely is a 'Conjunction' and lexeme is 'как'.
//'все' most likely is a 'AdjectivePronoun' and lexeme is 'весь'.
```

View File

@ -1,13 +1,37 @@
extern crate mystem;
use mystem::Other::Obscene;
#[allow(unused_must_use)]
fn main() -> Result<(), mystem::AppError> {
let mut instance = mystem::MyStem::new()?;
for stem in instance.stemming("Связался с лучшим - подохни как все.".into())?
for stem in instance.stemming("Связался с лучшим - подохни как все, Говноед.".into())?
{
println!("{} is a lexeme of {}", stem.lex, stem.text)
println!(
"'{}' most likely is a '{}' and lexeme is '{}'.{}{}",
stem.text,
stem.lex[0].grammem.part_of_speech,
stem.lex[0].lex,
{
match stem.lex[0]
.grammem
.facts
.contains(&mystem::Fact::Other(Obscene))
{
true => " Obscene lexis.",
false => "",
}
},
{
match stem.lex.len()
{
0|1 => "".to_string(),
x if x > 1 => format!(" Also has {} found lexems.", x),
_ => unreachable!()
}
}
)
}
instance.terminate();
Ok(())
}

View File

@ -15,6 +15,7 @@ use crate::Tense::{Inpresent, Past, Present};
use crate::Transitivity::{Intransitive, Transitive};
use crate::VerbPerson::{First, Second, Third};
use crate::Voice::{Active, Passive};
use std::fmt;
use std::str::FromStr;
#[derive(Debug)]
@ -31,58 +32,63 @@ pub struct Grammem {
#[derive(Debug, PartialEq)]
pub enum PartOfSpeech {
/// прилагательное
A,
Adjective,
/// наречие
ADV,
Adverb,
/// местоименное наречие
ADVPRO,
AdverbPronominal,
/// числительное-прилагательное
ANUM,
AdjectiveNumeral,
/// местоимение-прилагательное
APRO,
AdjectivePronoun,
/// часть композита - сложного слова
COM,
Composite,
/// союз
CONJ,
Conjunction,
/// междометие
INTJ,
Interjection,
/// числительное
NUM,
Numeral,
/// частица
PART,
Particle,
/// предлог
PR,
Preposition,
/// существительное
S,
Noun,
/// местоимение-существительное
SPRO,
AdjectiveNoun,
/// глагол
V,
Verb,
}
impl FromStr for PartOfSpeech {
type Err = crate::AppError;
fn from_str(input: &str) -> Result<PartOfSpeech, Self::Err> {
match input {
"A" => Ok(PartOfSpeech::A),
"ADV" => Ok(PartOfSpeech::ADV),
"ADVPRO" => Ok(PartOfSpeech::ADVPRO),
"ANUM" => Ok(PartOfSpeech::ANUM),
"APRO" => Ok(PartOfSpeech::APRO),
"COM" => Ok(PartOfSpeech::COM),
"CONJ" => Ok(PartOfSpeech::CONJ),
"INTJ" => Ok(PartOfSpeech::INTJ),
"NUM" => Ok(PartOfSpeech::NUM),
"PART" => Ok(PartOfSpeech::PART),
"PR" => Ok(PartOfSpeech::PR),
"S" => Ok(PartOfSpeech::S),
"SPRO" => Ok(PartOfSpeech::SPRO),
"V" => Ok(PartOfSpeech::V),
"A" => Ok(PartOfSpeech::Adjective),
"ADV" => Ok(PartOfSpeech::Adverb),
"ADVPRO" => Ok(PartOfSpeech::AdverbPronominal),
"ANUM" => Ok(PartOfSpeech::AdjectiveNumeral),
"APRO" => Ok(PartOfSpeech::AdjectivePronoun),
"COM" => Ok(PartOfSpeech::Composite),
"CONJ" => Ok(PartOfSpeech::Conjunction),
"INTJ" => Ok(PartOfSpeech::Interjection),
"NUM" => Ok(PartOfSpeech::Numeral),
"PART" => Ok(PartOfSpeech::Particle),
"PR" => Ok(PartOfSpeech::Preposition),
"S" => Ok(PartOfSpeech::Noun),
"SPRO" => Ok(PartOfSpeech::AdjectiveNoun),
"V" => Ok(PartOfSpeech::Verb),
_ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")),
}
}
}
/// Renders a part of speech as the name of its variant (e.g. `Verb`)
/// by reusing the derived `Debug` representation.
impl fmt::Display for PartOfSpeech {
    fn fmt(&self, out: &mut fmt::Formatter) -> fmt::Result {
        // Same expansion `write!` would produce, spelled out explicitly.
        out.write_fmt(format_args!("{:?}", self))
    }
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
pub enum Fact {
Case(Case),
Tense(Tense),
@ -98,8 +104,13 @@ pub enum Fact {
Transitivity(Transitivity),
Other(Other),
}
/// Renders a fact exactly like its derived `Debug` representation,
/// i.e. the variant name wrapping the inner value (e.g. `Case(Nominative)`).
impl fmt::Display for Fact {
    fn fmt(&self, out: &mut fmt::Formatter) -> fmt::Result {
        // Same expansion `write!` would produce, spelled out explicitly.
        out.write_fmt(format_args!("{:?}", self))
    }
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
pub enum Case {
Nominative, //именительный
Genitive, //родительный
@ -112,20 +123,20 @@ pub enum Case {
Vocative, //звательный
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Grammatical tense.
pub enum Tense {
Present, // present
Inpresent, // non-past
Past, // past
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Grammatical number.
pub enum Plurality {
Plural, // plural
Singular, // singular
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
pub enum Mood {
Gerunds, //деепричастие
Infinitive, //инфинитив
@ -134,58 +145,58 @@ pub enum Mood {
Imperative, //повелительное наклонение
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Adjective form.
pub enum Adjective {
Short, // short form
Long, // full form
Possessive, // possessive
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Degree of comparison.
pub enum ComparativeDegree {
Superlative, // superlative
Comparative, // comparative
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Grammatical person of a verb.
pub enum VerbPerson {
First, // 1st person
Second, // 2nd person
Third, // 3rd person
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Grammatical gender.
pub enum Gender {
Masculine, // masculine gender
Feminine, // feminine gender
Neuter, // neuter gender
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Grammatical aspect.
pub enum PerfectiveAspect {
Perfective, // perfective
Imperfective, // imperfective
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Grammatical voice.
pub enum Voice {
Passive, // passive voice
Active, // active voice
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Animacy.
pub enum Animacy {
Animate, // animate
Inanimate, // inanimate
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
/// Verb transitivity.
pub enum Transitivity {
Transitive, // transitive verb
Intransitive, // intransitive verb
}
#[derive(Debug)]
#[derive(Debug, PartialEq)]
pub enum Other {
Parenthesis, //вводное слово
Geo, //географическое название

View File

@ -12,21 +12,30 @@ extern crate log;
pub use error::*;
pub use grammems::*;
/// A Mystem process represented here
/// A Mystem process representation
#[derive(Debug)]
pub struct MyStem {
/// Process handle (`Popen`); presumably the `mystem` child started by
/// `open_process` — confirm against `new`.
pub process: Popen,
}
/// Stemmed result
/// Lexeme struct
#[derive(Debug)]
pub struct Stemming {
/// Original word
pub text: String,
pub struct Lexeme {
/// Detected lexeme (the `lex` value of one analysis entry)
pub lex: String,
/// Detected grammems (parsed from the entry's `gr` value)
pub grammem: Grammem,
/// Weight of the lexeme (the entry's `wt` value; `1.0` when it is absent or not a number)
pub weight: f64,
}
/// Stemmed result containing `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html)
#[derive(Debug)]
pub struct Stemming {
/// Original word
pub text: String,
/// `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html) of `text`:
/// one per entry of the `analysis` array, kept in the same order.
pub lex: Vec<Lexeme>,
}
impl MyStem {
@ -41,7 +50,7 @@ impl MyStem {
fn open_process() -> Result<Popen, PopenError> {
Popen::create(
&["mystem", "-d", "-i", "--format", "json", "--eng-gr"],
&["mystem", "-i", "--format", "json", "--eng-gr", "--weight"],
PopenConfig {
stdout: Redirection::Pipe,
stdin: Redirection::Pipe,
@ -119,10 +128,20 @@ impl MyStem {
for i in v {
stemmings.push(Stemming {
text: i["text"].to_string().replace("\"", ""),
lex: i["analysis"][0]["lex"].to_string().replace("\"", ""),
grammem: self.detect_grammems(
i["analysis"][0]["gr"].to_string().replace("\"", ""),
)?,
lex: {
i["analysis"]
.as_array()
.unwrap()
.iter()
.map(|z| Lexeme {
lex: z["lex"].to_string().replace("\"", ""),
grammem: self
.detect_grammems(z["gr"].to_string().replace("\"", ""))
.unwrap(),
weight: z["wt"].as_f64().unwrap_or(1.0),
})
.collect()
},
});
}
Ok(stemmings)