Tomita одесса

Embed Size (px)

Citation preview

  • 1. , 13 2013 , -

2. 3. - GLR (http://ru.wikipedia.org/wiki/GLR-) - , . - . - . 4. - http://nlpseminar.ru/lecture69/ - http://nlpseminar.ru/tomita.zip 5. ? 6. -?tomitaparser.exe - | | 7. , 8. ? .tar STDIN 9. "" . (Titanic) - 1997, , . ( ) (). (Intouchables) 2011 , . , . input.txt 10. , , -> , .S -> Noun; 11. 12. #encoding "utf-8"#GRAMMAR_ROOT SS -> Noun; film.cxx 13. encoding "utf8";import "base.proto"; import "articles_base.proto"; TAuxDicArticle "" { key = { "tomita:film.cxx" type=CUSTOM }} mydic.gzt 14. ? - Input (File, ...) ? - Input.Encoding ? - Output ? - + ? - Articles ? - Facts 15. encoding "utf8";TTextMinerConfig {Dictionary = "mydic.gzt"; ( )Input = {File = "input.txt"} Output = {File = "output.txt" Format = text} Articles = [ { Name = "" } ]} config.proto 16. !Windows:tomitaparser.exe config.protoLinux / *BSD (bash)./tomitaparser config.proto 17. "" ."" ( Titanic ) - 1997 , , "" . ( ) ( ) ." ( Intouchables ) 2011 , . , "" . output.txt 18. 19. encoding "utf8";TTextMinerConfig {Dictionary = "mydic.gzt";PrettyOutput = "pretty.html";Input = {File = "input.txt"}Output = {File = "output.txt"Format = text}Articles = [ { Name = "" } ]} config.proto 20. pretty.html 21. : h-reg1, h-reg2, h-reg3, l-reg : mw : fw : rt - 22. 23. #encoding "utf-8"#GRAMMAR_ROOT SName -> Word;Name -> Word Word;Name -> Word WordWord;S -> Name; film.cxx 24. 25. * 0 S -> Adj* Noun;=S -> Noun;S -> Adj Noun;S -> Adj Adj Noun; 26. + 1 S -> Adj+ Noun;=S -> Adj Noun;S -> Adj Adj Noun;S -> Adj Adj Adj Noun; 27. () 0 1 S -> (Adj) Noun;=S -> Noun;S -> Adj Noun; 28. #encoding "utf-8"#GRAMMAR_ROOT SName -> Word Word*;S -> Name; film.cxx 29. 30. import "base.proto"; import "facttypes_base.proto"; message Film: NFactType.TFact {required string Title = 1; } facttypes.proto 31. encoding "utf8";import "base.proto";import "articles_base.proto";import "facttypes.proto";TAuxDicArticle ""{key = { "tomita:film.cxx" type=CUSTOM }} mydic.gzt 32. 
encoding "utf8"; TTextMinerConfig {Dictionary = "mydic.gzt" PrettyOutput = "pretty.html" Input = {File = "input.txt"} Output = {File = "output.txt" Format = text} Articles = [ { Name = "" } ] Facts = [ { Name = "Film" } ]} config.proto 33. #encoding "utf-8" #GRAMMAR_ROOT S Name -> Word Word*; S -> Name interp (Film.Title); film.cxx 34. "" .Film{Title = "}"" ( Titanic ) - 1997 , , "" .Film{Title = Titanic} Film{Title = 1997} Film{Title = } output.txt 35. ( ) ( ) .Film { Title = } Film { Title = } Film { Title = } Film { Title = } output.txt 36. pretty.html 37. ?,,? ? 38. 39. , : gnc-agr : nc-agr : gc-agr : c-agr 40. #encoding "utf-8" #GRAMMAR_ROOT S Name -> Word Word*; S -> Name interp (Film.Title); film.cxx 41. pretty.html 42. 43. #encoding "utf-8" #GRAMMAR_ROOT S //Name -> Word Word*; FilmName -> Word; FilmName -> Word Word* Word; S -> FilmName interp (Film.Title); film.cxx 44. pretty.html 45. 1. 2. ? 46. film.cxx #encoding "utf-8" #GRAMMAR_ROOT S //Name -> Word Word*; FilmName -> Word; FilmName -> Word Word* Word; S -> FilmName interp (Film.Title::not_norm); 47. pretty.html 48. , . . 49. 50. encoding "utf8"; TAuxDicArticle "" {key = "" key = " " key = " " key = " " key = "-" key = ""} genre.gzt 51. encoding "utf8"; import "base.proto"; import "articles_base.proto"; import "facttypes.proto"; import "genre.gzt"; TAuxDicArticle "" {key = { "tomita:film.cxx" type=CUSTOM }} mydic.gzt 52. "" ."" (Titanic) - 1997 , , . ( ) ( ).(Intouchables) 2011 , . , . 53. 1. . 2. 3. , : 54. #encoding "utf-8" #GRAMMAR_ROOT S //Name -> Word Word*; FilmName -> Word; FilmName -> Word Word* Word; OriginalName -> Word Word*; Genre -> Word; Film -> ; Descr -> Genre | Film; S -> Descr FilmName interp (Film.Title::not_norm) (LBracket) (OriginalName) (RBracket); S -> FilmName interp (Film.Title::not_norm) (LBracket) (OriginalName) (RBracket) (Hyphen) Descr; film.cxx 55. pretty.html 56. , ! 57. 
import "base.proto"; import "facttypes_base.proto"; message Film: NFactType.TFact{required string Title = 1; optional string Style = 2; optional string OriginalTitle = 3;} facttypes.proto 58. Genre -> Word interp (Film.Style); Film -> ; Descr -> Genre | Film; S -> Descr FilmName interp (Film.Title::not_norm) (LBracket) (OriginalName interp (Film.OriginalTitle)) (RBracket); S -> FilmName interp (Film.Title::not_norm) (LBracket) (OriginalName interp (Film.OriginalTitle)) (RBracket) (Hyphen) Descr; film.cxx 59. pretty.output 60. 61. 1. 2. + + : 62. = ", , " = "" = "brev" gram 63. import "base.proto"; import "facttypes_base.proto"; message Film: NFactType.TFact{required string Title = 1; optional string Style = 2; optional string OriginalTitle = 3; optional string DirectorFio = 4;} facttypes.proto 64. Name -> Word Word*;.Director -> Name interp (Film.DirectorFio); Director -> Comma Name interp (Film.DirectorFio); DescrDirector -> Descr (Director); // + S -> DescrDirector FilmName interp (Film.Title::not_norm) (LBracket) (OriginalName interp (Film.OriginalTitle)) (RBracket); S -> FilmName interp (Film.Title::not_norm) (LBracket) (OriginalName interp (Film.OriginalTitle)) (RBracket) (Hyphen) DescrDirector; film.cxx 65. pretty.html 66. 67. , , ., . 68. #encoding "utf-8" #GRAMMAR_ROOT S Date -> Word; Descr -> ; S -> Date Descr; date.cxx 69. TAuxDicArticle "" {key = { "tomita:film.cxx" type=CUSTOM }} TAuxDicArticle "" {key = { "tomita:date.cxx" type=CUSTOM }} mydic.gzt 70. optional string Year = 5; facttypes.proto 71. Date -> AnyWord; Director -> Name interp (Film.DirectorFio); Director -> Comma Name interp (Film.DirectorFio); DescrDirector -> Descr (Date interp (Film.Year)) (Director); film.cxx 72. pretty.output 73. http://api.yandex.ru/tomita/ : 74. http://download.cdn.yandex.net/tomita/tomita-ukr-linux64.bz2 - Linux http://download.cdn.yandex.net/tomita/tomita-ukr-win32.zip - Windows- : 75. , ! 76. ?- 1 2 :- - - - - - - :- - - - 77. . 78. . 79. http://news.yandex.ru/people/el1tsin_boris.html 80. 
-: JAPE (Java Annotation Patterns Engine) GATE http://gate.ac.uk/ AGFL (Affix Grammars Over a Finite Lattice) - http://www.agfl.cs.ru.nl/ LSPL (LexicoSyntactic Pattern Language) , - http://www.lspl.ru/index.php AIRE (Artificial Intelligence Information Retrieval Engine) http://clck.ru/4JKhe 81. ? 82. [email protected] ,