mirror of
https://github.com/house-of-vanity/mystem-rs.git
synced 2025-08-21 16:07:15 +00:00
Impl Display for Grammems structs. Added readme. Reworked lexeme detection - now stores all the lexemes ordered by weight.
This commit is contained in:
@@ -1,18 +1,17 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mystem"
|
name = "mystem"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
authors = ["AB <ab@hexor.ru>"]
|
authors = ["AB <ab@hexor.ru>"]
|
||||||
license = "WTFPL"
|
license = "WTFPL"
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
description = "Wrapper around Yandex Mystem for Rust."
|
description = "Wrapper around Yandex Mystem for Rust."
|
||||||
homepage = "https://github.com/house-of-vanity/mystem-rs"
|
homepage = "https://github.com/house-of-vanity/mystem-rs"
|
||||||
repository = "https://github.com/house-of-vanity/mystem-rs"
|
repository = "https://github.com/house-of-vanity/mystem-rs"
|
||||||
|
readme = "README.md"
|
||||||
|
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
subprocess = "0.2.6"
|
subprocess = "0.2.6"
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
env_logger = "0.7"
|
|
||||||
log = { version = "^0.4.5", features = ["std"] }
|
log = { version = "^0.4.5", features = ["std"] }
|
||||||
failure = "0.1"
|
|
29
README.md
Normal file
29
README.md
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# MyStem Rust Wrapper
|
||||||
|
|
||||||
|
Rust wrapper for the Yandex MyStem 3.1 morphological analyzer of the Russian language.
|
||||||
|
|
||||||
|
#### System Requirements
|
||||||
|
The wrapper was tested on Ubuntu Linux 18.04+ and Windows 10.
|
||||||
|
The MyStem binary must be accessible via PATH, so manual installation is required. [MyStem Web Site](https://yandex.ru/dev/mystem/)
|
||||||
|
|
||||||
|
### A Quick Example
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let mut instance = mystem::MyStem::new()?;
|
||||||
|
for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
|
||||||
|
println!(
|
||||||
|
"'{}' most likely is a '{}' and lexeme is '{}'.",
|
||||||
|
stem.text,
|
||||||
|
stem.lex[0].grammem.part_of_speech,
|
||||||
|
stem.lex[0].lex
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
//'Связался' most likely is a 'Verb' and lexeme is 'связываться'.
|
||||||
|
//'с' most likely is a 'Preposition' and lexeme is 'с'.
|
||||||
|
//'лучшим' most likely is a 'Adjective' and lexeme is 'хороший'.
|
||||||
|
//'подохни' most likely is a 'Verb' and lexeme is 'подыхать'.
|
||||||
|
//'как' most likely is a 'Conjunction' and lexeme is 'как'.
|
||||||
|
//'все' most likely is a 'AdjectivePronoun' and lexeme is 'весь'.
|
||||||
|
|
||||||
|
```
|
@@ -1,13 +1,37 @@
|
|||||||
extern crate mystem;
|
extern crate mystem;
|
||||||
|
|
||||||
|
use mystem::Other::Obscene;
|
||||||
|
|
||||||
#[allow(unused_must_use)]
|
#[allow(unused_must_use)]
|
||||||
fn main() -> Result<(), mystem::AppError> {
|
fn main() -> Result<(), mystem::AppError> {
|
||||||
let mut instance = mystem::MyStem::new()?;
|
let mut instance = mystem::MyStem::new()?;
|
||||||
for stem in instance.stemming("Связался с лучшим - подохни как все.".into())?
|
for stem in instance.stemming("Связался с лучшим - подохни как все, Говноед.".into())?
|
||||||
{
|
{
|
||||||
println!("{} is a lexeme of {}", stem.lex, stem.text)
|
println!(
|
||||||
|
"'{}' most likely is a '{}' and lexeme is '{}'.{}{}",
|
||||||
|
stem.text,
|
||||||
|
stem.lex[0].grammem.part_of_speech,
|
||||||
|
stem.lex[0].lex,
|
||||||
|
{
|
||||||
|
match stem.lex[0]
|
||||||
|
.grammem
|
||||||
|
.facts
|
||||||
|
.contains(&mystem::Fact::Other(Obscene))
|
||||||
|
{
|
||||||
|
true => " Obscene lexis.",
|
||||||
|
false => "",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
match stem.lex.len()
|
||||||
|
{
|
||||||
|
0|1 => "".to_string(),
|
||||||
|
x if x > 1 => format!(" Also has {} found lexems.", x),
|
||||||
|
_ => unreachable!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
instance.terminate();
|
instance.terminate();
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@@ -15,6 +15,7 @@ use crate::Tense::{Inpresent, Past, Present};
|
|||||||
use crate::Transitivity::{Intransitive, Transitive};
|
use crate::Transitivity::{Intransitive, Transitive};
|
||||||
use crate::VerbPerson::{First, Second, Third};
|
use crate::VerbPerson::{First, Second, Third};
|
||||||
use crate::Voice::{Active, Passive};
|
use crate::Voice::{Active, Passive};
|
||||||
|
use std::fmt;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@@ -31,58 +32,63 @@ pub struct Grammem {
|
|||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum PartOfSpeech {
|
pub enum PartOfSpeech {
|
||||||
/// прилагательное
|
/// прилагательное
|
||||||
A,
|
Adjective,
|
||||||
/// наречие
|
/// наречие
|
||||||
ADV,
|
Adverb,
|
||||||
/// местоименное наречие
|
/// местоименное наречие
|
||||||
ADVPRO,
|
AdverbPronominal,
|
||||||
/// числительное-прилагательное
|
/// числительное-прилагательное
|
||||||
ANUM,
|
AdjectiveNumeral,
|
||||||
/// местоимение-прилагательное
|
/// местоимение-прилагательное
|
||||||
APRO,
|
AdjectivePronoun,
|
||||||
/// часть композита - сложного слова
|
/// часть композита - сложного слова
|
||||||
COM,
|
Composite,
|
||||||
/// союз
|
/// союз
|
||||||
CONJ,
|
Conjunction,
|
||||||
/// междометие
|
/// междометие
|
||||||
INTJ,
|
Interjection,
|
||||||
/// числительное
|
/// числительное
|
||||||
NUM,
|
Numeral,
|
||||||
/// частица
|
/// частица
|
||||||
PART,
|
Particle,
|
||||||
/// предлог
|
/// предлог
|
||||||
PR,
|
Preposition,
|
||||||
/// существительное
|
/// существительное
|
||||||
S,
|
Noun,
|
||||||
/// местоимение-существительное
|
/// местоимение-существительное
|
||||||
SPRO,
|
AdjectiveNoun,
|
||||||
/// глагол
|
/// глагол
|
||||||
V,
|
Verb,
|
||||||
}
|
}
|
||||||
impl FromStr for PartOfSpeech {
|
impl FromStr for PartOfSpeech {
|
||||||
type Err = crate::AppError;
|
type Err = crate::AppError;
|
||||||
fn from_str(input: &str) -> Result<PartOfSpeech, Self::Err> {
|
fn from_str(input: &str) -> Result<PartOfSpeech, Self::Err> {
|
||||||
match input {
|
match input {
|
||||||
"A" => Ok(PartOfSpeech::A),
|
"A" => Ok(PartOfSpeech::Adjective),
|
||||||
"ADV" => Ok(PartOfSpeech::ADV),
|
"ADV" => Ok(PartOfSpeech::Adverb),
|
||||||
"ADVPRO" => Ok(PartOfSpeech::ADVPRO),
|
"ADVPRO" => Ok(PartOfSpeech::AdverbPronominal),
|
||||||
"ANUM" => Ok(PartOfSpeech::ANUM),
|
"ANUM" => Ok(PartOfSpeech::AdjectiveNumeral),
|
||||||
"APRO" => Ok(PartOfSpeech::APRO),
|
"APRO" => Ok(PartOfSpeech::AdjectivePronoun),
|
||||||
"COM" => Ok(PartOfSpeech::COM),
|
"COM" => Ok(PartOfSpeech::Composite),
|
||||||
"CONJ" => Ok(PartOfSpeech::CONJ),
|
"CONJ" => Ok(PartOfSpeech::Conjunction),
|
||||||
"INTJ" => Ok(PartOfSpeech::INTJ),
|
"INTJ" => Ok(PartOfSpeech::Interjection),
|
||||||
"NUM" => Ok(PartOfSpeech::NUM),
|
"NUM" => Ok(PartOfSpeech::Numeral),
|
||||||
"PART" => Ok(PartOfSpeech::PART),
|
"PART" => Ok(PartOfSpeech::Particle),
|
||||||
"PR" => Ok(PartOfSpeech::PR),
|
"PR" => Ok(PartOfSpeech::Preposition),
|
||||||
"S" => Ok(PartOfSpeech::S),
|
"S" => Ok(PartOfSpeech::Noun),
|
||||||
"SPRO" => Ok(PartOfSpeech::SPRO),
|
"SPRO" => Ok(PartOfSpeech::AdjectiveNoun),
|
||||||
"V" => Ok(PartOfSpeech::V),
|
"V" => Ok(PartOfSpeech::Verb),
|
||||||
_ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")),
|
_ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
impl fmt::Display for PartOfSpeech {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
write!(f, "{:?}", self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Fact {
|
pub enum Fact {
|
||||||
Case(Case),
|
Case(Case),
|
||||||
Tense(Tense),
|
Tense(Tense),
|
||||||
@@ -98,8 +104,13 @@ pub enum Fact {
|
|||||||
Transitivity(Transitivity),
|
Transitivity(Transitivity),
|
||||||
Other(Other),
|
Other(Other),
|
||||||
}
|
}
|
||||||
|
impl fmt::Display for Fact {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
write!(f, "{:?}", self)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Case {
|
pub enum Case {
|
||||||
Nominative, //именительный
|
Nominative, //именительный
|
||||||
Genitive, //родительный
|
Genitive, //родительный
|
||||||
@@ -112,20 +123,20 @@ pub enum Case {
|
|||||||
Vocative, //звательный
|
Vocative, //звательный
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Tense {
|
pub enum Tense {
|
||||||
Present, //настоящее
|
Present, //настоящее
|
||||||
Inpresent, //непрошедшее
|
Inpresent, //непрошедшее
|
||||||
Past, //прошедшее
|
Past, //прошедшее
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Plurality {
|
pub enum Plurality {
|
||||||
Plural, //множественное число
|
Plural, //множественное число
|
||||||
Singular, //единственное число
|
Singular, //единственное число
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Mood {
|
pub enum Mood {
|
||||||
Gerunds, //деепричастие
|
Gerunds, //деепричастие
|
||||||
Infinitive, //инфинитив
|
Infinitive, //инфинитив
|
||||||
@@ -134,58 +145,58 @@ pub enum Mood {
|
|||||||
Imperative, //повелительное наклонение
|
Imperative, //повелительное наклонение
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Adjective {
|
pub enum Adjective {
|
||||||
Short, //Краткое
|
Short, //Краткое
|
||||||
Long, //Полное
|
Long, //Полное
|
||||||
Possessive, //притяжательное
|
Possessive, //притяжательное
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum ComparativeDegree {
|
pub enum ComparativeDegree {
|
||||||
Superlative, //превосходная
|
Superlative, //превосходная
|
||||||
Comparative, //сравнительная
|
Comparative, //сравнительная
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum VerbPerson {
|
pub enum VerbPerson {
|
||||||
First, //1-е лицо
|
First, //1-е лицо
|
||||||
Second, //2-е лицо
|
Second, //2-е лицо
|
||||||
Third, //3-е лицо
|
Third, //3-е лицо
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Gender {
|
pub enum Gender {
|
||||||
Masculine, //мужской род
|
Masculine, //мужской род
|
||||||
Feminine, //женский род
|
Feminine, //женский род
|
||||||
Neuter, //средний род
|
Neuter, //средний род
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum PerfectiveAspect {
|
pub enum PerfectiveAspect {
|
||||||
Perfective, //совершенный
|
Perfective, //совершенный
|
||||||
Imperfective, //несовершенный
|
Imperfective, //несовершенный
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Voice {
|
pub enum Voice {
|
||||||
Passive, //страдательный залог
|
Passive, //страдательный залог
|
||||||
Active, //действительный залог
|
Active, //действительный залог
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Animacy {
|
pub enum Animacy {
|
||||||
Animate, //одушевленное
|
Animate, //одушевленное
|
||||||
Inanimate, //неодушевленное
|
Inanimate, //неодушевленное
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Transitivity {
|
pub enum Transitivity {
|
||||||
Transitive, //переходный глагол
|
Transitive, //переходный глагол
|
||||||
Intransitive, //непереходный глагол
|
Intransitive, //непереходный глагол
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Other {
|
pub enum Other {
|
||||||
Parenthesis, //вводное слово
|
Parenthesis, //вводное слово
|
||||||
Geo, //географическое название
|
Geo, //географическое название
|
||||||
|
39
src/lib.rs
39
src/lib.rs
@@ -12,21 +12,30 @@ extern crate log;
|
|||||||
pub use error::*;
|
pub use error::*;
|
||||||
pub use grammems::*;
|
pub use grammems::*;
|
||||||
|
|
||||||
/// A Mystem process represented here
|
/// A Mystem process representation
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct MyStem {
|
pub struct MyStem {
|
||||||
pub process: Popen,
|
pub process: Popen,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Stemmed result
|
/// Lexeme struct
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Stemming {
|
pub struct Lexeme {
|
||||||
/// Original word
|
|
||||||
pub text: String,
|
|
||||||
/// Detected lexeme
|
/// Detected lexeme
|
||||||
pub lex: String,
|
pub lex: String,
|
||||||
/// Detected grammems
|
/// Detected grammems
|
||||||
pub grammem: Grammem,
|
pub grammem: Grammem,
|
||||||
|
/// Weight of Lexeme
|
||||||
|
pub weight: f64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stemmed result containing `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html)
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct Stemming {
|
||||||
|
/// Original word
|
||||||
|
pub text: String,
|
||||||
|
/// `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html) of `text`.
|
||||||
|
pub lex: Vec<Lexeme>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MyStem {
|
impl MyStem {
|
||||||
@@ -41,7 +50,7 @@ impl MyStem {
|
|||||||
|
|
||||||
fn open_process() -> Result<Popen, PopenError> {
|
fn open_process() -> Result<Popen, PopenError> {
|
||||||
Popen::create(
|
Popen::create(
|
||||||
&["mystem", "-d", "-i", "--format", "json", "--eng-gr"],
|
&["mystem", "-i", "--format", "json", "--eng-gr", "--weight"],
|
||||||
PopenConfig {
|
PopenConfig {
|
||||||
stdout: Redirection::Pipe,
|
stdout: Redirection::Pipe,
|
||||||
stdin: Redirection::Pipe,
|
stdin: Redirection::Pipe,
|
||||||
@@ -119,10 +128,20 @@ impl MyStem {
|
|||||||
for i in v {
|
for i in v {
|
||||||
stemmings.push(Stemming {
|
stemmings.push(Stemming {
|
||||||
text: i["text"].to_string().replace("\"", ""),
|
text: i["text"].to_string().replace("\"", ""),
|
||||||
lex: i["analysis"][0]["lex"].to_string().replace("\"", ""),
|
lex: {
|
||||||
grammem: self.detect_grammems(
|
i["analysis"]
|
||||||
i["analysis"][0]["gr"].to_string().replace("\"", ""),
|
.as_array()
|
||||||
)?,
|
.unwrap()
|
||||||
|
.iter()
|
||||||
|
.map(|z| Lexeme {
|
||||||
|
lex: z["lex"].to_string().replace("\"", ""),
|
||||||
|
grammem: self
|
||||||
|
.detect_grammems(z["gr"].to_string().replace("\"", ""))
|
||||||
|
.unwrap(),
|
||||||
|
weight: z["wt"].as_f64().unwrap_or(1.0),
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
Ok(stemmings)
|
Ok(stemmings)
|
||||||
|
Reference in New Issue
Block a user