mirror of
https://github.com/house-of-vanity/mystem-rs.git
synced 2025-07-06 13:14:07 +00:00
Init
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
/target
|
||||
Cargo.lock
|
17
Cargo.toml
Normal file
17
Cargo.toml
Normal file
@ -0,0 +1,17 @@
|
||||
[package]
|
||||
name = "mystem"
|
||||
version = "0.1.0"
|
||||
authors = ["AB <ab@hexor.ru>"]
|
||||
license = "WTFPL"
|
||||
edition = "2018"
|
||||
description = "Wrapper around Yandex Mystem for Rust."
|
||||
homepage = ""
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
subprocess = "0.2.6"
|
||||
serde_json = "1.0"
|
||||
env_logger = "0.7"
|
||||
log = { version = "^0.4.5", features = ["std"] }
|
||||
failure = "0.1"
|
13
examples/test.rs
Normal file
13
examples/test.rs
Normal file
@ -0,0 +1,13 @@
|
||||
extern crate mystem;
|
||||
|
||||
/// Stems a sample sentence with mystem and prints the lexeme detected for every word.
///
/// Returns an error when the mystem process cannot be started or its output
/// cannot be turned into stemmings.
fn main() -> Result<(), mystem::AppError> {
    let mut instance = mystem::MyStem::new()?;
    for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
        println!("{} is a lexeme of {}", stem.lex, stem.text)
    }

    // A failed termination is not fatal for this example, but it should be
    // visible instead of being silenced with a lint attribute.
    if let Err(e) = instance.terminate() {
        eprintln!("Failed to terminate mystem: {}", e);
    }
    Ok(())
}
|
20
src/error.rs
Normal file
20
src/error.rs
Normal file
@ -0,0 +1,20 @@
|
||||
use std::fmt;
|
||||
use subprocess::PopenError;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum AppError {
|
||||
PartOfSpeechError(&'static str),
|
||||
GrammemError(&'static str),
|
||||
PopenError(PopenError),
|
||||
MystemError(&'static str),
|
||||
}
|
||||
impl fmt::Display for AppError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "An error occurred.")
|
||||
}
|
||||
}
|
||||
impl From<PopenError> for AppError {
|
||||
fn from(e: PopenError) -> AppError {
|
||||
return AppError::PopenError(e);
|
||||
}
|
||||
}
|
266
src/grammems.rs
Normal file
266
src/grammems.rs
Normal file
@ -0,0 +1,266 @@
|
||||
use crate::error::*;
|
||||
use crate::Adjective::{Long, Possessive, Short};
|
||||
use crate::Animacy::{Animate, Inanimate};
|
||||
use crate::Case::*;
|
||||
use crate::ComparativeDegree::{Comparative, Superlative};
|
||||
use crate::Gender::{Feminine, Masculine, Neuter};
|
||||
use crate::Mood::{Gerunds, Imperative, Indicative, Infinitive, Participle};
|
||||
use crate::Other::{
|
||||
Abbreviation, Awkward, CommonForm, Distorted, FamilyName, Geo, Informal, Obscene, Obsolete,
|
||||
Parenthesis, Patronymic, Predicative, ProperNoun, Rare,
|
||||
};
|
||||
use crate::PerfectiveAspect::{Imperfective, Perfective};
|
||||
use crate::Plurality::{Plural, Singular};
|
||||
use crate::Tense::{Inpresent, Past, Present};
|
||||
use crate::Transitivity::{Intransitive, Transitive};
|
||||
use crate::VerbPerson::{First, Second, Third};
|
||||
use crate::Voice::{Active, Passive};
|
||||
use std::str::FromStr;
|
||||
|
||||
#[derive(Debug)]
|
||||
/// Represent grammems for [`Stemming`](./struct.Stemming.html)
|
||||
pub struct Grammem {
|
||||
/// Part of speech of [`Stemming`](./struct.Stemming.html)
|
||||
pub part_of_speech: PartOfSpeech,
|
||||
/// Parsed `Vec` of [`Facts`](./enum.Fact.html)
|
||||
pub facts: Vec<Fact>,
|
||||
/// Non-parsed list of grammems from mystem
|
||||
pub facts_raw: Vec<String>,
|
||||
}
|
||||
|
||||
/// Part-of-speech tags.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PartOfSpeech {
    /// Adjective
    A,
    /// Adverb
    ADV,
    /// Pronominal adverb
    ADVPRO,
    /// Numeral-adjective
    ANUM,
    /// Pronoun-adjective
    APRO,
    /// Part of a composite (compound) word
    COM,
    /// Conjunction
    CONJ,
    /// Interjection
    INTJ,
    /// Numeral
    NUM,
    /// Particle
    PART,
    /// Preposition
    PR,
    /// Noun
    S,
    /// Pronoun-noun
    SPRO,
    /// Verb
    V,
}
|
||||
impl FromStr for PartOfSpeech {
|
||||
type Err = crate::AppError;
|
||||
fn from_str(input: &str) -> Result<PartOfSpeech, Self::Err> {
|
||||
match input {
|
||||
"A" => Ok(PartOfSpeech::A),
|
||||
"ADV" => Ok(PartOfSpeech::ADV),
|
||||
"ADVPRO" => Ok(PartOfSpeech::ADVPRO),
|
||||
"ANUM" => Ok(PartOfSpeech::ANUM),
|
||||
"APRO" => Ok(PartOfSpeech::APRO),
|
||||
"COM" => Ok(PartOfSpeech::COM),
|
||||
"CONJ" => Ok(PartOfSpeech::CONJ),
|
||||
"INTJ" => Ok(PartOfSpeech::INTJ),
|
||||
"NUM" => Ok(PartOfSpeech::NUM),
|
||||
"PART" => Ok(PartOfSpeech::PART),
|
||||
"PR" => Ok(PartOfSpeech::PR),
|
||||
"S" => Ok(PartOfSpeech::S),
|
||||
"SPRO" => Ok(PartOfSpeech::SPRO),
|
||||
"V" => Ok(PartOfSpeech::V),
|
||||
_ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A single parsed grammeme, grouped by grammatical category.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Fact {
    Case(Case),
    Tense(Tense),
    Plurality(Plurality),
    Mood(Mood),
    Adjective(Adjective),
    ComparativeDegree(ComparativeDegree),
    Person(VerbPerson),
    Gender(Gender),
    PerfectiveAspect(PerfectiveAspect),
    Voice(Voice),
    Animacy(Animacy),
    Transitivity(Transitivity),
    Other(Other),
}

/// Grammatical case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Case {
    /// Nominative
    Nominative,
    /// Genitive
    Genitive,
    /// Dative
    Dative,
    /// Accusative
    Accusative,
    /// Instrumental
    Instrumental,
    /// Prepositional
    Prepositional,
    /// Partitive (second genitive)
    Partitive,
    /// Locative (second prepositional)
    Locative,
    /// Vocative
    Vocative,
}

/// Verb tense.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Tense {
    /// Present
    Present,
    /// Non-past
    Inpresent,
    /// Past
    Past,
}

/// Grammatical number.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Plurality {
    /// Plural
    Plural,
    /// Singular
    Singular,
}

/// Verb form or mood.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Mood {
    /// Gerund (adverbial participle)
    Gerunds,
    /// Infinitive
    Infinitive,
    /// Participle
    Participle,
    /// Indicative mood
    Indicative,
    /// Imperative mood
    Imperative,
}

/// Adjective form.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Adjective {
    /// Short form
    Short,
    /// Long (full) form
    Long,
    /// Possessive
    Possessive,
}

/// Degree of comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ComparativeDegree {
    /// Superlative
    Superlative,
    /// Comparative
    Comparative,
}

/// Grammatical person of a verb.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VerbPerson {
    /// First person
    First,
    /// Second person
    Second,
    /// Third person
    Third,
}

/// Grammatical gender.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Gender {
    /// Masculine
    Masculine,
    /// Feminine
    Feminine,
    /// Neuter
    Neuter,
}

/// Verb aspect.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PerfectiveAspect {
    /// Perfective
    Perfective,
    /// Imperfective
    Imperfective,
}

/// Grammatical voice.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Voice {
    /// Passive voice
    Passive,
    /// Active voice
    Active,
}

/// Animacy.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Animacy {
    /// Animate
    Animate,
    /// Inanimate
    Inanimate,
}

/// Verb transitivity.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Transitivity {
    /// Transitive verb
    Transitive,
    /// Intransitive verb
    Intransitive,
}

/// Remaining markers that do not belong to the categories above.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Other {
    /// Parenthetical (introductory) word
    Parenthesis,
    /// Geographical name
    Geo,
    /// The form is difficult to build
    Awkward,
    /// Proper name
    ProperNoun,
    /// Distorted form
    Distorted,
    /// Common form for masculine and feminine gender
    CommonForm,
    /// Obscene vocabulary
    Obscene,
    /// Patronymic
    Patronymic,
    /// Predicative
    Predicative,
    /// Colloquial form
    Informal,
    /// Rarely used word
    Rare,
    /// Abbreviation
    Abbreviation,
    /// Obsolete form
    Obsolete,
    /// Family name
    FamilyName,
}
|
||||
|
||||
impl FromStr for Fact {
|
||||
type Err = crate::AppError;
|
||||
fn from_str(input: &str) -> Result<Fact, Self::Err> {
|
||||
match input {
|
||||
"nom" => Ok(Fact::Case(Nominative)),
|
||||
"gen" => Ok(Fact::Case(Genitive)),
|
||||
"dat" => Ok(Fact::Case(Dative)),
|
||||
"acc" => Ok(Fact::Case(Accusative)),
|
||||
"ins" => Ok(Fact::Case(Instrumental)),
|
||||
"abl" => Ok(Fact::Case(Prepositional)),
|
||||
"part" => Ok(Fact::Case(Partitive)),
|
||||
"loc" => Ok(Fact::Case(Locative)),
|
||||
"voc" => Ok(Fact::Case(Vocative)),
|
||||
"praes" => Ok(Fact::Tense(Present)),
|
||||
"inpraes" => Ok(Fact::Tense(Inpresent)),
|
||||
"praet" => Ok(Fact::Tense(Past)),
|
||||
"sg" => Ok(Fact::Plurality(Singular)),
|
||||
"pl" => Ok(Fact::Plurality(Plural)),
|
||||
"ger" => Ok(Fact::Mood(Gerunds)),
|
||||
"inf" => Ok(Fact::Mood(Infinitive)),
|
||||
"partcp" => Ok(Fact::Mood(Participle)),
|
||||
"indic" => Ok(Fact::Mood(Indicative)),
|
||||
"imper" => Ok(Fact::Mood(Imperative)),
|
||||
"brev" => Ok(Fact::Adjective(Short)),
|
||||
"plen" => Ok(Fact::Adjective(Long)),
|
||||
"poss" => Ok(Fact::Adjective(Possessive)),
|
||||
"supr" => Ok(Fact::ComparativeDegree(Superlative)),
|
||||
"comp" => Ok(Fact::ComparativeDegree(Comparative)),
|
||||
"1p" => Ok(Fact::Person(First)),
|
||||
"2p" => Ok(Fact::Person(Second)),
|
||||
"3p" => Ok(Fact::Person(Third)),
|
||||
"m" => Ok(Fact::Gender(Masculine)),
|
||||
"f" => Ok(Fact::Gender(Feminine)),
|
||||
"n" => Ok(Fact::Gender(Neuter)),
|
||||
"pf" => Ok(Fact::PerfectiveAspect(Perfective)),
|
||||
"ipf" => Ok(Fact::PerfectiveAspect(Imperfective)),
|
||||
"act" => Ok(Fact::Voice(Active)),
|
||||
"pass" => Ok(Fact::Voice(Passive)),
|
||||
"anim" => Ok(Fact::Animacy(Animate)),
|
||||
"inan" => Ok(Fact::Animacy(Inanimate)),
|
||||
"tran" => Ok(Fact::Transitivity(Transitive)),
|
||||
"intr" => Ok(Fact::Transitivity(Intransitive)),
|
||||
"parenth" => Ok(Fact::Other(Parenthesis)),
|
||||
"geo" => Ok(Fact::Other(Geo)),
|
||||
"awkw" => Ok(Fact::Other(Awkward)),
|
||||
"persn" => Ok(Fact::Other(ProperNoun)),
|
||||
"dist" => Ok(Fact::Other(Distorted)),
|
||||
"mf" => Ok(Fact::Other(CommonForm)),
|
||||
"obsc" => Ok(Fact::Other(Obscene)),
|
||||
"patrn" => Ok(Fact::Other(Patronymic)),
|
||||
"praed" => Ok(Fact::Other(Predicative)),
|
||||
"inform" => Ok(Fact::Other(Informal)),
|
||||
"rare" => Ok(Fact::Other(Rare)),
|
||||
"abbr" => Ok(Fact::Other(Abbreviation)),
|
||||
"obsol" => Ok(Fact::Other(Obsolete)),
|
||||
"famn" => Ok(Fact::Other(FamilyName)),
|
||||
//_ => Ok(Fact::Case(Vocative)),
|
||||
_ => Err(AppError::GrammemError("Failed to get Grammem.")),
|
||||
}
|
||||
}
|
||||
}
|
133
src/lib.rs
Normal file
133
src/lib.rs
Normal file
@ -0,0 +1,133 @@
|
||||
#![crate_name = "mystem"]
|
||||
mod error;
|
||||
mod grammems;
|
||||
|
||||
use serde_json::Value;
|
||||
use std::io::{prelude::*, BufReader, Error, Write};
|
||||
use std::str::FromStr;
|
||||
use subprocess::{Popen, PopenConfig, PopenError, Redirection};
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
pub use error::*;
|
||||
pub use grammems::*;
|
||||
|
||||
/// A Mystem process represented here
|
||||
#[derive(Debug)]
|
||||
pub struct MyStem {
|
||||
pub process: Popen,
|
||||
}
|
||||
|
||||
/// Stemmed result
|
||||
#[derive(Debug)]
|
||||
pub struct Stemming {
|
||||
/// Original word
|
||||
pub text: String,
|
||||
/// Detected lexeme
|
||||
pub lex: String,
|
||||
/// Detected grammems
|
||||
pub grammem: Grammem,
|
||||
}
|
||||
|
||||
impl MyStem {
|
||||
/// Returns a MyStem instance with running process
|
||||
/// of mystem binary. It keeps mystem running all the time
|
||||
/// and reuse it.
|
||||
pub fn new() -> Result<Self, AppError> {
|
||||
let p = MyStem::open_process()?;
|
||||
debug!("Mystem started with PID {}", p.pid().unwrap());
|
||||
Ok(Self { process: p })
|
||||
}
|
||||
|
||||
fn open_process() -> Result<Popen, PopenError> {
|
||||
Popen::create(
|
||||
&["mystem", "-d", "-i", "--format", "json", "--eng-gr"],
|
||||
PopenConfig {
|
||||
stdout: Redirection::Pipe,
|
||||
stdin: Redirection::Pipe,
|
||||
..Default::default()
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Terminate mystem instance.
|
||||
#[allow(dead_code)]
|
||||
pub fn terminate(&mut self) -> Result<(), Error> {
|
||||
self.process.terminate()
|
||||
}
|
||||
|
||||
fn detect_grammems(&mut self, gr: String) -> Result<Grammem, AppError> {
|
||||
let mut res: Vec<String> = gr
|
||||
.split(|s| s == '=' || s == ',')
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
res.retain(|x| x != "");
|
||||
Ok(Grammem {
|
||||
part_of_speech: PartOfSpeech::from_str(res[0].as_str())?,
|
||||
facts: res
|
||||
.clone()
|
||||
.split_off(1)
|
||||
.iter_mut()
|
||||
.map(|f| Fact::from_str(f).unwrap())
|
||||
.collect(),
|
||||
facts_raw: res.split_off(1),
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns `Vec` with [`mystem::Stemming`](./struct.Stemming.html) for each word in `text`
|
||||
/// # Examples
|
||||
///
|
||||
/// ```rust
|
||||
/// let mut instance = mystem::MyStem::new()?;
|
||||
/// for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
|
||||
/// println!("{} is a lexeme of {}", stem.lex, stem.text)
|
||||
/// }
|
||||
/// // связываться is a lexeme of Связался
|
||||
/// // с is a lexeme of с
|
||||
/// // хороший is a lexeme of лучшим
|
||||
/// // подыхать is a lexeme of подохни
|
||||
/// // как is a lexeme of как
|
||||
/// // все is a lexeme of все
|
||||
/// ```
|
||||
pub fn stemming(&mut self, text: String) -> Result<Vec<Stemming>, AppError> {
|
||||
if let Some(exit_status) = self.process.poll() {
|
||||
warn!(
|
||||
"MyStem process ({:?}) exited with: {:?}. Restarting...",
|
||||
self.process.pid().unwrap(),
|
||||
exit_status
|
||||
);
|
||||
self.process = MyStem::open_process()?;
|
||||
}
|
||||
let clean_text = format!("{}{}", text.trim(), "\n");
|
||||
self.process
|
||||
.stdin
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.write(clean_text.as_bytes());
|
||||
let mut contents = String::new();
|
||||
let mut buf_reader = BufReader::new(self.process.stdout.as_ref().unwrap());
|
||||
buf_reader.read_line(&mut contents);
|
||||
|
||||
let mut stemmings: Vec<Stemming> = Vec::new();
|
||||
match Some(contents) {
|
||||
Some(contents) => {
|
||||
let v: Vec<Value> = match serde_json::from_str(contents.as_str()) {
|
||||
Ok(val) => val,
|
||||
Err(_) => return Ok(vec![]),
|
||||
};
|
||||
for i in v {
|
||||
stemmings.push(Stemming {
|
||||
text: i["text"].to_string().replace("\"", ""),
|
||||
lex: i["analysis"][0]["lex"].to_string().replace("\"", ""),
|
||||
grammem: self.detect_grammems(
|
||||
i["analysis"][0]["gr"].to_string().replace("\"", ""),
|
||||
)?,
|
||||
});
|
||||
}
|
||||
Ok(stemmings)
|
||||
}
|
||||
None => return Ok(vec![]),
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user