mirror of
https://github.com/house-of-vanity/mystem-rs.git
synced 2025-07-06 13:14:07 +00:00
Impl Display for Grammems structs. Added readme. Reworked lexeme detection - now stores all the lexemes ordered by weight.
This commit is contained in:
@ -1,18 +1,17 @@
|
||||
[package]
|
||||
name = "mystem"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
authors = ["AB <ab@hexor.ru>"]
|
||||
license = "WTFPL"
|
||||
edition = "2018"
|
||||
description = "Wrapper around Yandex Mystem for Rust."
|
||||
homepage = "https://github.com/house-of-vanity/mystem-rs"
|
||||
repository = "https://github.com/house-of-vanity/mystem-rs"
|
||||
readme = "README.md"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
subprocess = "0.2.6"
|
||||
serde_json = "1.0"
|
||||
env_logger = "0.7"
|
||||
log = { version = "^0.4.5", features = ["std"] }
|
||||
failure = "0.1"
|
29
README.md
Normal file
29
README.md
Normal file
@ -0,0 +1,29 @@
|
||||
# MyStem Rust Wrapper
|
||||
|
||||
Rust wrapper for the Yandex MyStem 3.1 morphological analyzer of the Russian language.
|
||||
|
||||
#### System Requirements
|
||||
The wrapper was tested on Ubuntu Linux 18.04+, Windows 10.
|
||||
The MyStem binary must be accessible via PATH, so manual installation is required. [MyStem Web Site](https://yandex.ru/dev/mystem/)
|
||||
|
||||
### A Quick Example
|
||||
|
||||
```rust
|
||||
let mut instance = mystem::MyStem::new()?;
|
||||
for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
|
||||
println!(
|
||||
"'{}' most likely is a '{}' and lexeme is '{}'.",
|
||||
stem.text,
|
||||
stem.lex[0].grammem.part_of_speech,
|
||||
stem.lex[0].lex
|
||||
)
|
||||
}
|
||||
|
||||
//'Связался' most likely is a 'Verb' and lexeme is 'связываться'.
|
||||
//'с' most likely is a 'Preposition' and lexeme is 'с'.
|
||||
//'лучшим' most likely is a 'Adjective' and lexeme is 'хороший'.
|
||||
//'подохни' most likely is a 'Verb' and lexeme is 'подыхать'.
|
||||
//'как' most likely is a 'Conjunction' and lexeme is 'как'.
|
||||
//'все' most likely is a 'AdjectivePronoun' and lexeme is 'весь'.
|
||||
|
||||
```
|
@ -1,13 +1,37 @@
|
||||
extern crate mystem;
|
||||
|
||||
use mystem::Other::Obscene;
|
||||
|
||||
#[allow(unused_must_use)]
|
||||
fn main() -> Result<(), mystem::AppError> {
|
||||
let mut instance = mystem::MyStem::new()?;
|
||||
for stem in instance.stemming("Связался с лучшим - подохни как все.".into())?
|
||||
for stem in instance.stemming("Связался с лучшим - подохни как все, Говноед.".into())?
|
||||
{
|
||||
println!("{} is a lexeme of {}", stem.lex, stem.text)
|
||||
println!(
|
||||
"'{}' most likely is a '{}' and lexeme is '{}'.{}{}",
|
||||
stem.text,
|
||||
stem.lex[0].grammem.part_of_speech,
|
||||
stem.lex[0].lex,
|
||||
{
|
||||
match stem.lex[0]
|
||||
.grammem
|
||||
.facts
|
||||
.contains(&mystem::Fact::Other(Obscene))
|
||||
{
|
||||
true => " Obscene lexis.",
|
||||
false => "",
|
||||
}
|
||||
},
|
||||
{
|
||||
match stem.lex.len()
|
||||
{
|
||||
0|1 => "".to_string(),
|
||||
x if x > 1 => format!(" Also has {} found lexems.", x),
|
||||
_ => unreachable!()
|
||||
}
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
instance.terminate();
|
||||
Ok(())
|
||||
}
|
||||
|
@ -15,6 +15,7 @@ use crate::Tense::{Inpresent, Past, Present};
|
||||
use crate::Transitivity::{Intransitive, Transitive};
|
||||
use crate::VerbPerson::{First, Second, Third};
|
||||
use crate::Voice::{Active, Passive};
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
|
||||
#[derive(Debug)]
|
||||
@ -31,58 +32,63 @@ pub struct Grammem {
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum PartOfSpeech {
|
||||
/// Adjective
|
||||
A,
|
||||
Adjective,
|
||||
/// Adverb
|
||||
ADV,
|
||||
Adverb,
|
||||
/// Pronominal adverb
|
||||
ADVPRO,
|
||||
AdverbPronominal,
|
||||
/// Numeral adjective
|
||||
ANUM,
|
||||
AdjectiveNumeral,
|
||||
/// Pronominal adjective
|
||||
APRO,
|
||||
AdjectivePronoun,
|
||||
/// Part of a composite (compound word)
|
||||
COM,
|
||||
Composite,
|
||||
/// Conjunction
|
||||
CONJ,
|
||||
Conjunction,
|
||||
/// Interjection
|
||||
INTJ,
|
||||
Interjection,
|
||||
/// Numeral
|
||||
NUM,
|
||||
Numeral,
|
||||
/// Particle
|
||||
PART,
|
||||
Particle,
|
||||
/// Preposition
|
||||
PR,
|
||||
Preposition,
|
||||
/// Noun
|
||||
S,
|
||||
Noun,
|
||||
/// Pronominal noun (pronoun-noun)
// NOTE(review): the original comment describes a pronoun-noun, yet the new
// variant is named `AdjectiveNoun` — looks like a misnomer; confirm before
// renaming, since the variant is part of the public API.
|
||||
SPRO,
|
||||
AdjectiveNoun,
|
||||
/// Verb
|
||||
V,
|
||||
Verb,
|
||||
}
|
||||
impl FromStr for PartOfSpeech {
|
||||
type Err = crate::AppError;
|
||||
fn from_str(input: &str) -> Result<PartOfSpeech, Self::Err> {
|
||||
match input {
|
||||
"A" => Ok(PartOfSpeech::A),
|
||||
"ADV" => Ok(PartOfSpeech::ADV),
|
||||
"ADVPRO" => Ok(PartOfSpeech::ADVPRO),
|
||||
"ANUM" => Ok(PartOfSpeech::ANUM),
|
||||
"APRO" => Ok(PartOfSpeech::APRO),
|
||||
"COM" => Ok(PartOfSpeech::COM),
|
||||
"CONJ" => Ok(PartOfSpeech::CONJ),
|
||||
"INTJ" => Ok(PartOfSpeech::INTJ),
|
||||
"NUM" => Ok(PartOfSpeech::NUM),
|
||||
"PART" => Ok(PartOfSpeech::PART),
|
||||
"PR" => Ok(PartOfSpeech::PR),
|
||||
"S" => Ok(PartOfSpeech::S),
|
||||
"SPRO" => Ok(PartOfSpeech::SPRO),
|
||||
"V" => Ok(PartOfSpeech::V),
|
||||
"A" => Ok(PartOfSpeech::Adjective),
|
||||
"ADV" => Ok(PartOfSpeech::Adverb),
|
||||
"ADVPRO" => Ok(PartOfSpeech::AdverbPronominal),
|
||||
"ANUM" => Ok(PartOfSpeech::AdjectiveNumeral),
|
||||
"APRO" => Ok(PartOfSpeech::AdjectivePronoun),
|
||||
"COM" => Ok(PartOfSpeech::Composite),
|
||||
"CONJ" => Ok(PartOfSpeech::Conjunction),
|
||||
"INTJ" => Ok(PartOfSpeech::Interjection),
|
||||
"NUM" => Ok(PartOfSpeech::Numeral),
|
||||
"PART" => Ok(PartOfSpeech::Particle),
|
||||
"PR" => Ok(PartOfSpeech::Preposition),
|
||||
"S" => Ok(PartOfSpeech::Noun),
|
||||
"SPRO" => Ok(PartOfSpeech::AdjectiveNoun),
|
||||
"V" => Ok(PartOfSpeech::Verb),
|
||||
_ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")),
|
||||
}
|
||||
}
|
||||
}
|
||||
/// Formats a `PartOfSpeech` for display by reusing its `Debug` output.
impl fmt::Display for PartOfSpeech {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// Delegate to the derived `Debug` representation.
write!(f, "{:?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum Fact {
|
||||
Case(Case),
|
||||
Tense(Tense),
|
||||
@ -98,8 +104,13 @@ pub enum Fact {
|
||||
Transitivity(Transitivity),
|
||||
Other(Other),
|
||||
}
|
||||
/// Formats a `Fact` for display by reusing its `Debug` output.
impl fmt::Display for Fact {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// Delegate to the derived `Debug` representation.
write!(f, "{:?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum Case {
|
||||
Nominative, //именительный
|
||||
Genitive, //родительный
|
||||
@ -112,20 +123,20 @@ pub enum Case {
|
||||
Vocative, //звательный
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Grammatical tense.
pub enum Tense {
|
||||
Present, // present
|
||||
Inpresent, // non-past
|
||||
Past, // past
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Grammatical number.
pub enum Plurality {
|
||||
Plural, // plural (the previous comment said "present", copied from `Tense`)
|
||||
Singular, // singular (the previous comment said "non-past", copied from `Tense`)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum Mood {
|
||||
Gerunds, //деепричастие
|
||||
Infinitive, //инфинитив
|
||||
@ -134,58 +145,58 @@ pub enum Mood {
|
||||
Imperative, //повелительное наклонение
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Adjective form.
pub enum Adjective {
|
||||
Short, // short form
|
||||
Long, // full form
|
||||
Possessive, // possessive
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Degree of comparison.
pub enum ComparativeDegree {
|
||||
Superlative, // superlative
|
||||
Comparative, // comparative
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Grammatical person.
pub enum VerbPerson {
|
||||
First, // 1st person
|
||||
Second, // 2nd person
|
||||
Third, // 3rd person
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Grammatical gender.
pub enum Gender {
|
||||
Masculine, // masculine gender
|
||||
Feminine, // feminine gender
|
||||
Neuter, // neuter gender
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Verbal aspect.
pub enum PerfectiveAspect {
|
||||
Perfective, // perfective
|
||||
Imperfective, // imperfective
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Grammatical voice.
pub enum Voice {
|
||||
Passive, // passive voice
|
||||
Active, // active voice
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Animacy.
pub enum Animacy {
|
||||
Animate, // animate
|
||||
Inanimate, // inanimate
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
/// Verb transitivity.
pub enum Transitivity {
|
||||
Transitive, // transitive verb
|
||||
Intransitive, // intransitive verb
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum Other {
|
||||
Parenthesis, //вводное слово
|
||||
Geo, //географическое название
|
||||
|
39
src/lib.rs
39
src/lib.rs
@ -12,21 +12,30 @@ extern crate log;
|
||||
pub use error::*;
|
||||
pub use grammems::*;
|
||||
|
||||
/// A Mystem process represented here
|
||||
/// A Mystem process representation
|
||||
#[derive(Debug)]
|
||||
pub struct MyStem {
|
||||
pub process: Popen,
|
||||
}
|
||||
|
||||
/// Stemmed result
|
||||
/// Lexeme struct
|
||||
#[derive(Debug)]
|
||||
pub struct Stemming {
|
||||
/// Original word
|
||||
pub text: String,
|
||||
pub struct Lexeme {
|
||||
/// Detected lexeme
|
||||
pub lex: String,
|
||||
/// Detected grammems
|
||||
pub grammem: Grammem,
|
||||
/// Weight of Lexeme
|
||||
pub weight: f64,
|
||||
}
|
||||
|
||||
/// Stemmed result containing `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html)
|
||||
#[derive(Debug)]
|
||||
pub struct Stemming {
|
||||
/// Original word
|
||||
pub text: String,
|
||||
/// `Vec` of [`mystem::Lexeme`](./struct.Lexeme.html) of `text`.
|
||||
pub lex: Vec<Lexeme>,
|
||||
}
|
||||
|
||||
impl MyStem {
|
||||
@ -41,7 +50,7 @@ impl MyStem {
|
||||
|
||||
fn open_process() -> Result<Popen, PopenError> {
|
||||
Popen::create(
|
||||
&["mystem", "-d", "-i", "--format", "json", "--eng-gr"],
|
||||
&["mystem", "-i", "--format", "json", "--eng-gr", "--weight"],
|
||||
PopenConfig {
|
||||
stdout: Redirection::Pipe,
|
||||
stdin: Redirection::Pipe,
|
||||
@ -119,10 +128,20 @@ impl MyStem {
|
||||
for i in v {
|
||||
stemmings.push(Stemming {
|
||||
text: i["text"].to_string().replace("\"", ""),
|
||||
lex: i["analysis"][0]["lex"].to_string().replace("\"", ""),
|
||||
grammem: self.detect_grammems(
|
||||
i["analysis"][0]["gr"].to_string().replace("\"", ""),
|
||||
)?,
|
||||
lex: {
|
||||
i["analysis"]
|
||||
.as_array()
|
||||
.unwrap()
|
||||
.iter()
|
||||
.map(|z| Lexeme {
|
||||
lex: z["lex"].to_string().replace("\"", ""),
|
||||
grammem: self
|
||||
.detect_grammems(z["gr"].to_string().replace("\"", ""))
|
||||
.unwrap(),
|
||||
weight: z["wt"].as_f64().unwrap_or(1.0),
|
||||
})
|
||||
.collect()
|
||||
},
|
||||
});
|
||||
}
|
||||
Ok(stemmings)
|
||||
|
Reference in New Issue
Block a user