commit e421d6f38226afd65aabd83648bb6504c40f172e
Author: AB
Date:   Sun Dec 27 23:04:09 2020 +0300

    Init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..1b2679e
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "mystem"
+version = "0.1.0"
+authors = ["AB "]
+license = "WTFPL"
+edition = "2018"
+description = "Wrapper around Yandex Mystem for Rust."
+homepage = ""
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+subprocess = "0.2.6"
+serde_json = "1.0"
+env_logger = "0.7"
+log = { version = "^0.4.5", features = ["std"] }
+failure = "0.1"
\ No newline at end of file
diff --git a/examples/test.rs b/examples/test.rs
new file mode 100644
index 0000000..791d725
--- /dev/null
+++ b/examples/test.rs
@@ -0,0 +1,15 @@
+//! Prints the lexeme of every word in a sample sentence.
+extern crate mystem;
+
+fn main() -> Result<(), mystem::AppError> {
+    let mut instance = mystem::MyStem::new()?;
+    for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
+        println!("{} is a lexeme of {}", stem.lex, stem.text)
+    }
+
+    // Shutdown is best-effort: report a failure instead of silently ignoring it.
+    if let Err(e) = instance.terminate() {
+        eprintln!("Failed to terminate mystem: {}", e);
+    }
+    Ok(())
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..dee01aa
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,35 @@
+use std::fmt;
+use subprocess::PopenError;
+
+/// Errors returned by this crate.
+#[derive(Debug)]
+pub enum AppError {
+    /// The part-of-speech tag could not be parsed.
+    PartOfSpeechError(&'static str),
+    /// A grammeme tag could not be parsed.
+    GrammemError(&'static str),
+    /// The mystem process could not be started.
+    PopenError(PopenError),
+    /// Talking to the mystem process failed.
+    MystemError(&'static str),
+}
+
+impl fmt::Display for AppError {
+    /// Writes the message carried by the variant rather than a fixed placeholder.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            AppError::PartOfSpeechError(msg)
+            | AppError::GrammemError(msg)
+            | AppError::MystemError(msg) => write!(f, "{}", msg),
+            AppError::PopenError(e) => write!(f, "Failed to run mystem: {}", e),
+        }
+    }
+}
+
+impl std::error::Error for AppError {}
+
+impl From<PopenError> for AppError {
+    fn from(e: PopenError) -> AppError {
+        AppError::PopenError(e)
+    }
+}
diff --git a/src/grammems.rs b/src/grammems.rs
new file mode 100644
index 0000000..183931c
--- /dev/null
+++ b/src/grammems.rs
@@ -0,0 +1,285 @@
+use crate::error::*;
+use crate::Adjective::{Long, Possessive, Short};
+use crate::Animacy::{Animate, Inanimate};
+use crate::Case::*;
+use crate::ComparativeDegree::{Comparative, Superlative};
+use crate::Gender::{Feminine, Masculine, Neuter};
+use crate::Mood::{Gerunds, Imperative, Indicative, Infinitive, Participle};
+use crate::Other::{
+    Abbreviation, Awkward, CommonForm, Distorted, FamilyName, Geo, Informal, Obscene, Obsolete,
+    Parenthesis, Patronymic, Predicative, ProperNoun, Rare,
+};
+use crate::PerfectiveAspect::{Imperfective, Perfective};
+use crate::Plurality::{Plural, Singular};
+use crate::Tense::{Inpresent, Past, Present};
+use crate::Transitivity::{Intransitive, Transitive};
+use crate::VerbPerson::{First, Second, Third};
+use crate::Voice::{Active, Passive};
+use std::str::FromStr;
+
+/// Represent grammems for [`Stemming`](./struct.Stemming.html)
+#[derive(Debug)]
+pub struct Grammem {
+    /// Part of speech of [`Stemming`](./struct.Stemming.html)
+    pub part_of_speech: PartOfSpeech,
+    /// Parsed `Vec` of [`Facts`](./enum.Fact.html)
+    pub facts: Vec<Fact>,
+    /// Non-parsed list of grammems from mystem
+    pub facts_raw: Vec<String>,
+}
+
+/// Part-of-speech tag reported by mystem.
+#[derive(Debug, PartialEq)]
+pub enum PartOfSpeech {
+    /// Adjective
+    A,
+    /// Adverb
+    ADV,
+    /// Pronominal adverb
+    ADVPRO,
+    /// Numeral adjective
+    ANUM,
+    /// Pronominal adjective
+    APRO,
+    /// Part of a compound word
+    COM,
+    /// Conjunction
+    CONJ,
+    /// Interjection
+    INTJ,
+    /// Numeral
+    NUM,
+    /// Particle
+    PART,
+    /// Preposition
+    PR,
+    /// Noun
+    S,
+    /// Pronominal noun
+    SPRO,
+    /// Verb
+    V,
+}
+
+impl FromStr for PartOfSpeech {
+    type Err = crate::AppError;
+
+    /// Parses a mystem part-of-speech tag such as `"S"` or `"ADVPRO"`.
+    fn from_str(input: &str) -> Result<Self, Self::Err> {
+        match input {
+            "A" => Ok(PartOfSpeech::A),
+            "ADV" => Ok(PartOfSpeech::ADV),
+            "ADVPRO" => Ok(PartOfSpeech::ADVPRO),
+            "ANUM" => Ok(PartOfSpeech::ANUM),
+            "APRO" => Ok(PartOfSpeech::APRO),
+            "COM" => Ok(PartOfSpeech::COM),
+            "CONJ" => Ok(PartOfSpeech::CONJ),
+            "INTJ" => Ok(PartOfSpeech::INTJ),
+            "NUM" => Ok(PartOfSpeech::NUM),
+            "PART" => Ok(PartOfSpeech::PART),
+            "PR" => Ok(PartOfSpeech::PR),
+            "S" => Ok(PartOfSpeech::S),
+            "SPRO" => Ok(PartOfSpeech::SPRO),
+            "V" => Ok(PartOfSpeech::V),
+            _ => Err(AppError::PartOfSpeechError("Failed to get Part of Speech.")),
+        }
+    }
+}
+
+/// A single grammeme parsed from the mystem `gr` string.
+#[derive(Debug)]
+pub enum Fact {
+    Case(Case),
+    Tense(Tense),
+    Plurality(Plurality),
+    Mood(Mood),
+    Adjective(Adjective),
+    ComparativeDegree(ComparativeDegree),
+    Person(VerbPerson),
+    Gender(Gender),
+    PerfectiveAspect(PerfectiveAspect),
+    Voice(Voice),
+    Animacy(Animacy),
+    Transitivity(Transitivity),
+    Other(Other),
+}
+
+/// Grammatical case.
+#[derive(Debug)]
+pub enum Case {
+    Nominative,    // `nom`
+    Genitive,      // `gen`
+    Dative,        // `dat`
+    Accusative,    // `acc`
+    Instrumental,  // `ins`
+    Prepositional, // `abl`
+    Partitive,     // `part`, second genitive
+    Locative,      // `loc`, second prepositional
+    Vocative,      // `voc`
+}
+
+/// Verb tense.
+#[derive(Debug)]
+pub enum Tense {
+    Present,   // `praes`
+    Inpresent, // `inpraes`, non-past
+    Past,      // `praet`
+}
+
+/// Grammatical number.
+#[derive(Debug)]
+pub enum Plurality {
+    Plural,   // `pl`
+    Singular, // `sg`
+}
+
+/// Verb form or mood.
+#[derive(Debug)]
+pub enum Mood {
+    Gerunds,    // `ger`, adverbial participle
+    Infinitive, // `inf`
+    Participle, // `partcp`
+    Indicative, // `indic`
+    Imperative, // `imper`
+}
+
+/// Adjective form.
+#[derive(Debug)]
+pub enum Adjective {
+    Short,      // `brev`
+    Long,       // `plen`, full form
+    Possessive, // `poss`
+}
+
+/// Degree of comparison.
+#[derive(Debug)]
+pub enum ComparativeDegree {
+    Superlative, // `supr`
+    Comparative, // `comp`
+}
+
+/// Verb person.
+#[derive(Debug)]
+pub enum VerbPerson {
+    First,  // `1p`
+    Second, // `2p`
+    Third,  // `3p`
+}
+
+/// Grammatical gender.
+#[derive(Debug)]
+pub enum Gender {
+    Masculine, // `m`
+    Feminine,  // `f`
+    Neuter,    // `n`
+}
+
+/// Verb aspect.
+#[derive(Debug)]
+pub enum PerfectiveAspect {
+    Perfective,   // `pf`
+    Imperfective, // `ipf`
+}
+
+/// Verb voice.
+#[derive(Debug)]
+pub enum Voice {
+    Passive, // `pass`
+    Active,  // `act`
+}
+
+/// Animacy.
+#[derive(Debug)]
+pub enum Animacy {
+    Animate,   // `anim`
+    Inanimate, // `inan`
+}
+
+/// Verb transitivity.
+#[derive(Debug)]
+pub enum Transitivity {
+    Transitive,   // `tran`
+    Intransitive, // `intr`
+}
+
+/// Miscellaneous markers.
+#[derive(Debug)]
+pub enum Other {
+    Parenthesis,  // `parenth`, parenthetical word
+    Geo,          // `geo`, geographical name
+    Awkward,      // `awkw`, the form is difficult to build
+    ProperNoun,   // `persn`, proper noun
+    Distorted,    // `dist`, distorted form
+    CommonForm,   // `mf`, common form of masculine and feminine gender
+    Obscene,      // `obsc`, obscene vocabulary
+    Patronymic,   // `patrn`
+    Predicative,  // `praed`
+    Informal,     // `inform`, colloquial form
+    Rare,         // `rare`, rarely occurring word
+    Abbreviation, // `abbr`
+    Obsolete,     // `obsol`, obsolete form
+    FamilyName,   // `famn`
+}
+
+impl FromStr for Fact {
+    type Err = crate::AppError;
+
+    /// Parses a single mystem grammeme tag such as `"nom"` or `"sg"`.
+    fn from_str(input: &str) -> Result<Self, Self::Err> {
+        match input {
+            "nom" => Ok(Fact::Case(Nominative)),
+            "gen" => Ok(Fact::Case(Genitive)),
+            "dat" => Ok(Fact::Case(Dative)),
+            "acc" => Ok(Fact::Case(Accusative)),
+            "ins" => Ok(Fact::Case(Instrumental)),
+            "abl" => Ok(Fact::Case(Prepositional)),
+            "part" => Ok(Fact::Case(Partitive)),
+            "loc" => Ok(Fact::Case(Locative)),
+            "voc" => Ok(Fact::Case(Vocative)),
+            "praes" => Ok(Fact::Tense(Present)),
+            "inpraes" => Ok(Fact::Tense(Inpresent)),
+            "praet" => Ok(Fact::Tense(Past)),
+            "sg" => Ok(Fact::Plurality(Singular)),
+            "pl" => Ok(Fact::Plurality(Plural)),
+            "ger" => Ok(Fact::Mood(Gerunds)),
+            "inf" => Ok(Fact::Mood(Infinitive)),
+            "partcp" => Ok(Fact::Mood(Participle)),
+            "indic" => Ok(Fact::Mood(Indicative)),
+            "imper" => Ok(Fact::Mood(Imperative)),
+            "brev" => Ok(Fact::Adjective(Short)),
+            "plen" => Ok(Fact::Adjective(Long)),
+            "poss" => Ok(Fact::Adjective(Possessive)),
+            "supr" => Ok(Fact::ComparativeDegree(Superlative)),
+            "comp" => Ok(Fact::ComparativeDegree(Comparative)),
+            "1p" => Ok(Fact::Person(First)),
+            "2p" => Ok(Fact::Person(Second)),
+            "3p" => Ok(Fact::Person(Third)),
+            "m" => Ok(Fact::Gender(Masculine)),
+            "f" => Ok(Fact::Gender(Feminine)),
+            "n" => Ok(Fact::Gender(Neuter)),
+            "pf" => Ok(Fact::PerfectiveAspect(Perfective)),
+            "ipf" => Ok(Fact::PerfectiveAspect(Imperfective)),
+            "act" => Ok(Fact::Voice(Active)),
+            "pass" => Ok(Fact::Voice(Passive)),
+            "anim" => Ok(Fact::Animacy(Animate)),
+            "inan" => Ok(Fact::Animacy(Inanimate)),
+            "tran" => Ok(Fact::Transitivity(Transitive)),
+            "intr" => Ok(Fact::Transitivity(Intransitive)),
+            "parenth" => Ok(Fact::Other(Parenthesis)),
+            "geo" => Ok(Fact::Other(Geo)),
+            "awkw" => Ok(Fact::Other(Awkward)),
+            "persn" => Ok(Fact::Other(ProperNoun)),
+            "dist" => Ok(Fact::Other(Distorted)),
+            "mf" => Ok(Fact::Other(CommonForm)),
+            "obsc" => Ok(Fact::Other(Obscene)),
+            "patrn" => Ok(Fact::Other(Patronymic)),
+            "praed" => Ok(Fact::Other(Predicative)),
+            "inform" => Ok(Fact::Other(Informal)),
+            "rare" => Ok(Fact::Other(Rare)),
+            "abbr" => Ok(Fact::Other(Abbreviation)),
+            "obsol" => Ok(Fact::Other(Obsolete)),
+            "famn" => Ok(Fact::Other(FamilyName)),
+            _ => Err(AppError::GrammemError("Failed to get Grammem.")),
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..01caa0f
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,174 @@
+#![crate_name = "mystem"]
+mod error;
+mod grammems;
+
+use serde_json::Value;
+use std::io::{prelude::*, BufReader, Error, Write};
+use std::str::FromStr;
+use subprocess::{Popen, PopenConfig, PopenError, Redirection};
+#[macro_use]
+extern crate log;
+use std::collections::{HashMap, HashSet};
+
+pub use error::*;
+pub use grammems::*;
+
+/// A Mystem process represented here
+#[derive(Debug)]
+pub struct MyStem {
+    pub process: Popen,
+}
+
+/// Stemmed result
+#[derive(Debug)]
+pub struct Stemming {
+    /// Original word
+    pub text: String,
+    /// Detected lexeme
+    pub lex: String,
+    /// Detected grammems
+    pub grammem: Grammem,
+}
+
+impl MyStem {
+    /// Returns a MyStem instance with running process
+    /// of mystem binary. It keeps mystem running all the time
+    /// and reuse it.
+    ///
+    /// # Errors
+    ///
+    /// Returns `AppError::PopenError` when the `mystem` binary cannot be started.
+    pub fn new() -> Result<Self, AppError> {
+        let p = MyStem::open_process()?;
+        // `pid()` returns an `Option`; log it only when known instead of unwrapping.
+        if let Some(pid) = p.pid() {
+            debug!("Mystem started with PID {}", pid);
+        }
+        Ok(Self { process: p })
+    }
+
+    /// Spawns `mystem` with JSON output and piped stdin/stdout.
+    fn open_process() -> Result<Popen, PopenError> {
+        Popen::create(
+            &["mystem", "-d", "-i", "--format", "json", "--eng-gr"],
+            PopenConfig {
+                stdout: Redirection::Pipe,
+                stdin: Redirection::Pipe,
+                ..Default::default()
+            },
+        )
+    }
+
+    /// Terminate mystem instance.
+    #[allow(dead_code)]
+    pub fn terminate(&mut self) -> Result<(), Error> {
+        self.process.terminate()
+    }
+
+    /// Parses a mystem `gr` string: the first tag is the part of speech,
+    /// the remaining tags are grammemes.
+    fn detect_grammems(gr: &str) -> Result<Grammem, AppError> {
+        let mut tags = gr
+            .split(|c| c == '=' || c == ',')
+            .filter(|tag| !tag.is_empty());
+        // An empty `gr` has no part of speech: report it instead of indexing out of bounds.
+        let first = tags
+            .next()
+            .ok_or(AppError::PartOfSpeechError("Failed to get Part of Speech."))?;
+        let part_of_speech = PartOfSpeech::from_str(first)?;
+        let facts_raw: Vec<String> = tags.map(str::to_string).collect();
+        // Unknown tags propagate as `GrammemError` rather than panicking.
+        let facts = facts_raw
+            .iter()
+            .map(|tag| Fact::from_str(tag))
+            .collect::<Result<Vec<Fact>, AppError>>()?;
+        Ok(Grammem {
+            part_of_speech,
+            facts,
+            facts_raw,
+        })
+    }
+
+    /// Returns `Vec` with [`mystem::Stemming`](./struct.Stemming.html) for each word in `text`
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when mystem cannot be restarted, when its pipes cannot be
+    /// written or read, or when a part of speech or grammeme tag is not recognised.
+    ///
+    /// # Examples
+    ///
+    /// ```rust,no_run
+    /// # fn main() -> Result<(), mystem::AppError> {
+    /// let mut instance = mystem::MyStem::new()?;
+    /// for stem in instance.stemming("Связался с лучшим - подохни как все.".into())? {
+    ///     println!("{} is a lexeme of {}", stem.lex, stem.text)
+    /// }
+    /// // связываться is a lexeme of Связался
+    /// // с is a lexeme of с
+    /// // хороший is a lexeme of лучшим
+    /// // подыхать is a lexeme of подохни
+    /// // как is a lexeme of как
+    /// // все is a lexeme of все
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn stemming(&mut self, text: String) -> Result<Vec<Stemming>, AppError> {
+        // Read the PID before polling: `pid()` is `None` once the exit has been observed.
+        let pid = self.process.pid();
+        if let Some(exit_status) = self.process.poll() {
+            warn!(
+                "MyStem process ({:?}) exited with: {:?}. Restarting...",
+                pid,
+                exit_status
+            );
+            self.process = MyStem::open_process()?;
+        }
+
+        // The reply is read as a single line, so the request must be a single line too.
+        let clean_text = text.trim().replace(|c: char| c == '\n' || c == '\r', " ");
+        if clean_text.is_empty() {
+            return Ok(vec![]);
+        }
+
+        let mut stdin = self
+            .process
+            .stdin
+            .as_ref()
+            .ok_or(AppError::MystemError("Mystem stdin is not available."))?;
+        // `write_all` retries partial writes, and a failure is reported instead of ignored.
+        stdin
+            .write_all(format!("{}\n", clean_text).as_bytes())
+            .map_err(|_| AppError::MystemError("Failed to write to mystem."))?;
+
+        let stdout = self
+            .process
+            .stdout
+            .as_ref()
+            .ok_or(AppError::MystemError("Mystem stdout is not available."))?;
+        let mut contents = String::new();
+        // NOTE(review): a new `BufReader` per call discards anything buffered past this line.
+        BufReader::new(stdout)
+            .read_line(&mut contents)
+            .map_err(|_| AppError::MystemError("Failed to read from mystem."))?;
+
+        // Output that is not a JSON array is treated as "no words".
+        let tokens: Vec<Value> = match serde_json::from_str(contents.as_str()) {
+            Ok(val) => val,
+            Err(_) => return Ok(vec![]),
+        };
+
+        let mut stemmings = Vec::with_capacity(tokens.len());
+        for token in &tokens {
+            // Indexing a `Value` yields `Null` for a missing entry instead of panicking.
+            let analysis = &token["analysis"][0];
+            stemmings.push(Stemming {
+                // `as_str` gives the decoded text, so no quote stripping is needed.
+                text: token["text"].as_str().unwrap_or("").to_string(),
+                lex: analysis["lex"].as_str().unwrap_or("").to_string(),
+                grammem: Self::detect_grammems(analysis["gr"].as_str().unwrap_or(""))?,
+            });
+        }
+        Ok(stemmings)
+    }
+}