@article {5915, title = {Inferring Inflection Classes with Description Length}, journal = {Journal of Language Modelling}, volume = {5}, year = {2017}, pages = {465--525}, doi = {http://dx.doi.org/10.15398/jlm.v5i3.184 }, url = {http://jlm.ipipan.waw.pl/index.php/JLM/article/view/184/182}, author = {Sacha Beniamine and Olivier Bonami and Beno{\^\i}t Sagot} } @conference {4813, title = {Information-theoretic inflectional classification}, year = {2015}, month = {07/2015}, address = {Belgrade}, author = {Sacha Beniamine and Olivier Bonami and Beno{\^\i}t Sagot} } @inproceedings {4444, title = {Developing a French FrameNet: Methodology and First results}, year = {2014}, publisher = {European Language Resources Association (ELRA)}, address = {Reykjavik, Iceland}, url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/496_Paper.pdf}, author = {Marie Candito and Pascal Amsili and Lucie Barque and Farah Benamara and de Chalendar, Ga{\"e}l and Djemaa, Marianne and Pauline Haas and Richard Huyghe and Yvette Yannick Mathieu and Muller, Philippe and Beno{\^\i}t Sagot and Vieu, Laure} } @inproceedings {baranes:hal-01002723, title = {{A Language-Independent Approach to Extracting Derivational Relations from an Inflectional Lexicon}}, year = {2014}, address = {Reykjavik, Iceland}, keywords = {Derivational Relation, Formal Analogy, Morphological Analysis}, url = {https://hal.inria.fr/hal-01002723}, author = {Baranes, Marion and Beno{\^\i}t Sagot} } @inproceedings {baranes:hal-01019998, title = {{Normalisation de textes par analogie: le cas des mots inconnus}}, year = {2014}, pages = {137-148}, address = {Marseille, France}, keywords = {Analogy, Spell checking, Text normalization}, url = {https://hal.inria.fr/hal-01019998}, author = {Baranes, Marion and Beno{\^\i}t Sagot} } @inproceedings {sagot:hal-01022351, title = {Sous-cat{\'e}gorisation en pour et syntaxe lexicale}, year = {2014}, month = {Jul}, address = {Marseille, France}, keywords = {Arguments en pour, Lexiques 
syntaxiques, Sous-cat{\'e}gorisation}, url = {https://hal.inria.fr/hal-01022351}, author = {Beno{\^\i}t Sagot and Danlos, Laurence and Margot Colinet} } @inproceedings {hanoka:hal-01022306, title = {{YaMTG: An Open-Source Heavily Multilingual Translation Graph Extracted from Wiktionaries and Parallel Corpora}}, year = {2014}, publisher = {{European Language Resources Association}}, address = {Reykjavik, Iceland}, url = {https://hal.inria.fr/hal-01022306}, author = {Hanoka, Val{\'e}rie and Beno{\^\i}t Sagot} } @inproceedings {sagot:hal-00832078, title = {{Extension dynamique de lexiques morphologiques pour le fran{\c c}ais {\`a} partir d{\textquoteright}un flux textuel}}, year = {2013}, pages = {407-420}, address = {Les sables d{\textquoteright}Olonne, France}, keywords = {Dynamic Lexica, Morphological Analysis, Neologisms}, url = {https://hal.inria.fr/hal-00832078}, author = {Beno{\^\i}t Sagot and Nouvel, Damien and Mouilleron, Virginie and Baranes, Marion} } @inproceedings {5887, title = {Implementing a formal model of inflectional morphology}, year = {2013}, pages = {115-134}, author = {Beno{\^\i}t Sagot and G{\'e}raldine Walther} } @inproceedings {sagot:hal-00699300, title = {{Aleda, a free large-scale entity database for French}}, year = {2012}, pages = {4 pages}, address = {Istanbul, Turkey}, keywords = {entity database, entity linking, named Entities}, url = {https://hal.archives-ouvertes.fr/hal-00699300}, author = {Beno{\^\i}t Sagot and Stern, Rosa} } @inproceedings {seddah:hal-00703124, title = {{The Alpage Architecture at the SANCL 2012 Shared Task: Robust Pre-Processing and Lexical Bridging for User-Generated Content Parsing}}, year = {2012}, month = {Jun}, address = {Montr{\'e}al, Canada}, url = {https://hal.inria.fr/hal-00703124}, author = {Seddah, Djam{\'e} and Beno{\^\i}t Sagot and Marie Candito} } @inproceedings {sagot:hal-00703108, title = {{Annotation r{\'e}f{\'e}rentielle du Corpus Arbor{\'e} de Paris 7 en entit{\'e}s nomm{\'e}es}}, volume = {2 - 
TALN}, year = {2012}, month = {Jun}, address = {Grenoble, France}, url = {https://hal.inria.fr/hal-00703108}, author = {Beno{\^\i}t Sagot and Richard, Marion and Stern, Rosa}, editor = {Georges Antoniadis} } @inproceedings {seddah:hal-00780898, title = {{Building a treebank of noisy user-generated content: The French Social Media Bank}}, year = {2012}, note = {Cet article constitue une version r{\'e}duite de l{\textquoteright}article {\textquoteright}{\textquoteright}The French Social Media Bank : a Treebank of Noisy User Generated Content{\textquoteright}{\textquoteright} (m{\^e}mes auteurs)}, address = {Lisbonne, Portugal}, keywords = {parsing, Social Media, Social Media., treebanking, User Generated Content}, url = {https://hal.inria.fr/hal-00780898}, author = {Seddah, Djam{\'e} and Beno{\^\i}t Sagot and Marie Candito and Mouilleron, Virginie and Combet, Vanessa} } @inproceedings {eckard:hal-00936500, title = {{Dictionary-Ontology Cross-Enrichment Using TLFi and WOLF to enrich one another}}, year = {2012}, publisher = {{Curran Associates, Inc}}, address = {Mumbai, India}, url = {https://hal.inria.fr/hal-00936500}, author = {Eckard, Emmanuel and Lucie Barque and Nasr, Alexis and Beno{\^\i}t Sagot}, editor = {Michael Zock and Reinhard Rapp} } @inproceedings {seddah:hal-00780895, title = {{The French Social Media Bank: a Treebank of Noisy User Generated Content}}, year = {2012}, month = {Dec}, publisher = {{Kay, Martin and Boitet, Christian}}, address = {Mumbai, India}, url = {https://hal.inria.fr/hal-00780895}, author = {Seddah, Djam{\'e} and Beno{\^\i}t Sagot and Marie Candito and Mouilleron, Virginie and Combet, Vanessa} } @inproceedings {stern:hal-00699295, title = {{A Joint Named Entity Recognition and Entity Linking System}}, year = {2012}, pages = {{\textendash}}, address = {Avignon, France}, keywords = {entity linking, statistical NER, symbolic NER}, url = {https://hal.archives-ouvertes.fr/hal-00699295}, author = {Stern, Rosa and Beno{\^\i}t Sagot and 
B{\'e}chet, Fr{\'e}d{\'e}ric} } @inproceedings {sagot:hal-00703128, title = {{Merging syntactic lexica: the case for French verbs}}, year = {2012}, month = {May}, address = {Istanbul, Turkey}, url = {https://hal.inria.fr/hal-00703128}, author = {Beno{\^\i}t Sagot and Danlos, Laurence} } @inproceedings {stern:hal-00699297, title = {{Population of a Knowledge Base for News Metadata from Unstructured Text and Web Data}}, year = {2012}, month = {Jun}, pages = {{\textendash}}, address = {Montr{\'e}al, Canada}, keywords = {entity linking, knowledge base population, web data extraction}, url = {https://hal.archives-ouvertes.fr/hal-00699297}, author = {Stern, Rosa and Beno{\^\i}t Sagot} } @inproceedings {hanoka:hal-00701606, title = {{Wordnet creation and extension made simple: A multilingual lexicon-based approach using wiki resources}}, year = {2012}, month = {May}, pages = {6}, address = {Istanbul, Turkey}, keywords = {Wiki resources, Word Sense Disambiguation, WordNet}, url = {https://hal.archives-ouvertes.fr/hal-00701606}, author = {Hanoka, Val{\'e}rie and Beno{\^\i}t Sagot} } @inproceedings {975, title = {Construction d{\textquoteright}un lexique des adjectifs d{\'e}nominaux}, year = {2011}, pages = {69-74}, keywords = {adjectifs d{\'e}nominaux, lexique d{\'e}rivationnel}, author = {Jana Strnadov{\'a} and Beno{\^\i}t Sagot} } @inproceedings {sagot11perlextaln, title = {D{\'e}veloppement de ressources pour le persan: le nouveau lexique morphologique PerLex2 et l{\textquoteright}{\'e}tiqueteur morphosyntaxique MElt_fa}, year = {2011}, month = {06/2011}, address = {Montpellier, France}, abstract = {

R{\'e}sum{\'e}. Nous pr{\'e}sentons une nouvelle version de PerLex, lexique morphologique du persan, une version corrig{\'e}e et partiellement r{\'e}annot{\'e}e du corpus {\'e}tiquet{\'e} BijanKhan (BijanKhan, 2004) et MEltfa, un nouvel {\'e}tiqueteur morphosyntaxique librement disponible pour le persan. Apr{\`e}s avoir d{\'e}velopp{\'e} une premi{\`e}re version de PerLex (Sagot \& Walther, 2010), nous en proposons donc ici une version am{\'e}lior{\'e}e. Outre une validation manuelle partielle, PerLex 2 repose d{\'e}sormais sur un inventaire de cat{\'e}gories linguistiquement motiv{\'e}. Nous avons {\'e}galement d{\'e}velopp{\'e} une nouvelle version du corpus BijanKhan : elle contient des corrections significatives de la tokenisation ainsi qu{\textquoteright}un r{\'e}{\'e}tiquetage {\`a} l{\textquoteright}aide des nouvelles cat{\'e}gories. Cette nouvelle version du corpus a enfin {\'e}t{\'e} utilis{\'e}e pour l{\textquoteright}entra{\^{\i}}nement de MEltfa, notre {\'e}tiqueteur morphosyntaxique pour le persan librement disponible, s{\textquoteright}appuyant {\`a} la fois sur ce nouvel inventaire de cat{\'e}gories, sur PerLex 2 et sur le syst{\`e}me d{\textquoteright}{\'e}tiquetage MElt (Denis \& Sagot, 2009).

Abstract. We present a new version of PerLex, the morphological lexicon for the Persian language, a corrected and partially re-annotated version of the BijanKhan corpus (BijanKhan, 2004) and MEltfa, a new freely available POS-tagger for the Persian language. After PerLex{\textquoteright}s first version (Sagot \& Walther, 2010), we propose an improved version of our morphological lexicon. Apart from a partial manual validation, PerLex 2 now relies on a set of linguistically motivated POS. Based on these POS, we also developed a new version of the BijanKhan corpus with significant corrections of the tokenisation. It has been re-tagged according to the new set of POS. The new version of the BijanKhan corpus has been used to develop MEltfa, our new freely-available POS-tagger for the Persian language, based on the new POS set, PerLex 2 and the MElt tagging system (Denis \& Sagot, 2009).

}, keywords = {cat{\'e}gories, {\'e}tiqueteur morphosyntaxique, Lexical resource, MElt, MElt., PerLex, persan, Persian, POS, Ressource lexicale, tagger, validation}, author = {Beno{\^\i}t Sagot and G{\'e}raldine Walther and Pegah Faghiri and Samvelian, Pollet} } @article {walther11tal, title = {Mod{\'e}lisation et impl{\'e}mentation de ph{\'e}nom{\`e}nes non-canoniques}, journal = {Revue TAL}, volume = {52}, number = {2/2011}, year = {2011}, note = {

Vers la morphologie et au-del{\`a}.

}, month = {12/2011}, pages = {91-122}, chapter = {91}, abstract = {

R{\'E}SUM{\'E}. Les ph{\'e}nom{\`e}nes flexionnels non canoniques (d{\'e}ponence, h{\'e}t{\'e}roclise...) font l{\textquoteright}objet de nombreux travaux en morphologie th{\'e}orique. Toutefois, ces travaux manquent souvent d{\textquoteright}impl{\'e}mentations associ{\'e}es {\`a} des lexiques {\`a} grande {\'e}chelle, pourtant n{\'e}cessaires pour comparer objectivement la complexit{\'e} de descriptions morphologiques. Nous montrons comment parsli, notre mod{\`e}le de la morphologie flexionnelle, permet de repr{\'e}senter ces ph{\'e}nom{\`e}nes non canoniques et de les formaliser en vue d{\textquoteright}une impl{\'e}mentation. Nous l{\textquoteright}illustrons au moyen de donn{\'e}es de langues vari{\'e}es. Nous {\'e}valuons la complexit{\'e} de quatre mod{\'e}lisations morphologiques concurrentes pour les verbes du fran{\c c}ais gr{\^a}ce {\`a} la notion informationnelle de longueur de description et montrons que les concepts nouveaux de parsli r{\'e}duisent la complexit{\'e} des mod{\'e}lisations morphologiques par rapport {\`a} des mod{\`e}les traditionnels ou plus r{\'e}cents.

ABSTRACT. Non-canonical inflection (deponency, heteroclisis. . . ) is extensively studied in the- oretical morphology. However, these studies often lack practical implementations associated with large-scale lexica. Yet these are precisely the requirements for objective comparative stud- ies on the complexity of morphological descriptions. We show how parsli, our model of in- flectional morphology, manages to represent many non-canonical phenomena and to formalise them in way allowing for their subsequent implementation. We illustrate it with data about a variety of languages. We expose experiments conducted on the complexity of four compet- ing descriptions of French verbal inflection, which is evaluated using the information-theoretic concept of description length. We show that the new concepts introduced in parsli reduce the complexity of morphological descriptions w.r.t. both traditional or more recent models.\ 

}, keywords = {Canonicity, Description Complexity, Inflection Pattern, Inflection Zone, Inflectional Morphology, MDL., Paradigm Shape, parsli, Stem Pattern, Stem Zone}, url = {http://web.me.com/gwalther/homepage/Publications_(fr)_files/tal11morpho.pdf}, author = {G{\'e}raldine Walther and Beno{\^\i}t Sagot} } @inproceedings {sagot11icil, title = {A new morphological lexicon and a POS tagger for the Persian Language}, year = {2011}, note = {

Communication {\`a} la 4eme {\'e}dition de la International Conference on Iranian Linguistics (ICIL4). 17-19 juin 2011. Uppsala, Su{\`e}de

}, month = {06/2011}, address = {Uppsala, Su{\`e}de}, abstract = {

In (Sagot and Walther, 2010), the authors introduce an advanced tokenizer and a morpho- logical lexicon for the Persian language named PerLex. In this paper, we describe experiments dedicated to enriching this lexicon and using it for building a POS tagger for Persian.

Natural Language Processing (NLP) tasks such as part-of-speech (POS) tagging or pars- ing as well as most NLP applications require large-scale lexical resources. Yet, such resources rarely are freely available, even though it is the fastest way to building high-quality resources. In this paper, we introduce a new version of the large-scale and freely available morpholog- ical lexicon for Persian named PerLex, which relies on a new linguistically motivated POS inventory as well as several validation steps; we show how we used this new lexicon for gen- erating an improved version of the BijanKhan corpus (BijanKhan, 2004) and training the MElt tagging system (Denis and Sagot, 2009), thus creating a freely available Persian tagger.

The first important NLP project on Persian is the Shiraz project, targeted towards Persian to English automatic translation (Amtrup et al., 2000). Among other things, it produced 50,000 terms bilingual lexicon (which however does not seem to be freely available) based in part on a unification-based description of the Persian morphology (Megerdoomian, 2000). Apart from the Shiraz project, some other NLP tools such as morphological tools and lemmatisers have been developed, although not associated with a full large scale lexicon (cf. the freely available lemmatizer PerStem (Dehdari and Lonsdale, 2008)). To our best knowledge, the only freely available large-coverage lexical resources for Persian are the above-mentioned PerLex lexicon (Sagot and Walther, 2010) and the Persian lexicon within MULTEXT-East version 4 (Erjavec, 2010; QasemiZadeh and Rahimi, 2006). Other recent work on the development of NLP tools and resources for Persian processing is mostly focused on designing part-of-speech taggers (QasemiZadeh and Rahimi, 2006; Shamsfard and Fadaee, 2008), parsers (Dehdari and Lonsdale, 2008) or automatic translation systems.

Improving PerLex The PerLex 1 lexicon (Sagot and Walther, 2010) contained approx. 36,000 lexical entries (lemmas) corresponding to over 520,000 (inflected) form entries describing ap- prox. 500,000 unique forms. Apart from its underlying morphological description, PerLex 1 had mainly been built automatically using automatic lexical data aquisition techniques such as the extraction of lexical entries from the automatically tagged BijanKhan corpus (BijanKhan, 2004) and from Wikipedia. Therefore, the first step towards the construction of a new version, PerLex 2, was to improve the quality of the lexicon by validating the entries extracted from the BijanKhan corpus. We first automatically (pre-)validated a certain amount of entries, us- ing comparison and/or fusion of PerLex with other lexical resources (i.e. the Persian lexicon included in version 4 of MULTEXT-East (henceforth MTE4-fa) (QasemiZadeh and Rahimi, 2006; Erjavec, 2010) and the Persian Pronunciation Dictionary (henceforth PPD) (Deyhime, 2000). Being not freely distributable, we didn\&$\#$39;t use the PPD to provide us with additional entries, but only to pre-validate existing lexical entries, in particular those for which most inflected forms are found in the PPD. On the other hand, MTE4-fa is a freely available and

redistributable morphological lexicon including 13,006 lexical entries. We established a mapping between POS tags found in MTE4-fa and in PerLex, converted MTE4-fa into the same format as PerLex and merged it with PerLex. The entries resulting from merging entries from both resources were considered pre-validated. Entries corresponding only to MTE4-fa entries were added to PerLex (in many cases, this required to add the appropriate inflection class manually).

Entries automatically pre-validated were excluded from the manual validation (apart for nouns and adjectives) hence avoiding unnecessary manual validation costs. So far, we have carried out two separate manual validation campaigns using a dedicated online validation interface that aims at optimizing validation speed (for example, lexical entries are displayed as a canonical form and the minimal set of inflected forms whose correctness guarantees that the entry{\textquoteright}s inflection class is correct; another example is that the interface allows for specifying most types of inflection class assignment errors (e.g., a lemma ending in ی yeh pronounced [i] but considered as if it was pronounced [j]). The first validation campaign created 751 validation tickets (451 correct entries, 250 correct POS but invalid inflected forms, no invalid POS and 50 completely invalid entries, mostly due to encoding bugs we resolved in the meantime). The second validation campaign created 1,097 validation tickets (818 correct entries, 17 valid POS but invalid inflected forms, 26 invalid categories and 129 completely invalid entries, mostly inflected pronominal forms erroneously considered as individual lexical entries).

Another new feature of PerLex 2 is its new sound set of POS. PerLex 1 had simply adopted the POS used in the BijanKhan corpus (BijanKhan, 2004; Amiri et al., 2007). We decided to convert the lexicon into a new set of linguistically motivated POS (Faghiri \& Samvelian, in prep.): nouns, proper nouns, adjectives, adverbs; verbs, prepositions, conjunctions, classifiers, pronouns, determiners and interjections. The conversion has been realised through automatic conversion techniques. It was straightforward for nouns (N), verbs (V), proper nouns (PN), pronouns (PRO), interjections (INT), delimiters (DELM). For the other POS, precise criteria had to be established manually to re-assign their members. The POS MORP of the BijanKhan corpus has been altogether suppressed since it contained elements contributing to word-formation in various ways but not considered words in the description we adopted. On the other hand, we established a new POS tag for classifiers (CLASS) which replaces the old specifier-tag SPEC.

The size of PerLex 2 is similar to that of PerLex 1 (suppressing erroneous entries has quantitatively counter-balanced the addition of new entries and the conversion into a new POS set does not result in quantitative differences), yet it is the qualitative improvement, such as the addition of new inflection tables for auxiliaries and light verbs, that characterises PerLex 2.

Corpus modification The next step of our work was to develop a new tagger for Persian based on our POS inventory and on PerLex 2, using the MElt tagging system (Denis and Sagot, 2009). We first designed a tagset that is a refinement of this inventory. Our tagset defines 79 tags, among which 37 verbal tags, 9 pronominal tags and 8 nominal tags.

For training the MElt system, we decided to create a new version of the BijanKhan corpus. This new version differs in two ways: first, we improved the original automatic tokenization and annotation of the corpus. Second, we converted the corpus so that it uses our tagset. We started from the version of the corpus used in (Sagot and Walther, 2010), which is already segmented in 88,885 sentences. We applied rule-based transformations for correcting systematic tokenization and/or annotation errors. These include among others various kinds of typographic (e.g., whitespace) inconsistencies (verbal prefixes, nominal suffixes, acronyms, compound prepositions, removal of the MORP category, and others), whose correction requires modifications in the annotation itself. We also corrected systematic annotation errors. Next, we needed to convert the corpus annotations into our 79-tag tagset. In order to achieve a good level

of quality, we decided to convert mostly the annotation of those tokens for which we could find a unique tag from our tagset that was consistent with both the corrected corpus annotation and lexical information in PerLex 2. However, in rare cases, heuristics allowed us to choose among various possible tags, as well as to convert annotations for tokens unknown to PerLex (e.g., by relying on morphology-based patterns). The resulting modified BijanKhan corpus was then split in 3 parts. The last 100 sentences (1,568 of their 1,707 tokens could be con- verted) were extracted and the annotations manually converted (when needed) or corrected, leading to a gold standard. Among the remaining sentences, those for which all tokens had been successfully converted constitute a 18,731-sentence training corpus (302,690 tokens).

Tagging Persian with MEltfa Next, we extracted from PerLex 2 a lexicon based on our 79-tag tagset. Together with the above-described (far from error-free) training corpus, this al- lowed us to train the MElt system and generate a tagger for Persian, MEltfa. W.r.t. our gold standard, MEltfa has a 90.3\% accuracy on the full tagset, and a 93.3\% accuracy if we project this tagset on our 14 POS inventory. Evaluated only on the 1,568 tokens for which the anno- tations could be converted automatically, these figures reach respectively 93.9\% and 95.3\%. These figures are probably a lower bound on the accuracy we would reach if all annotations were converted successfully. Indeed, non-converted tokens have not been converted in the training data either: MElt has not learned any contextual information about them, hence more errors on these tokens (this in turn might affect MEltfa\&$\#$39;s decisions on surrounding tokens).

We compared the quality of MEltfa\&$\#$39;s annotations to those resulting from our automatic conversion process. It turns out that the accuracy of these annotations on those 1,568 tokens for which the automatic conversion was successful is exactly the same (93.9\% and 95.3\%) as that of MEltfa, although only 48\% concern the same tokens. In other words, on these 1,568 tokens, MEltfa was able to produce annotations whose quality is the same as the quality of its training corpus, which in turn is higher than that of the original BijanKhan corpus. We believe that this is related both to the use of PerLex as a source of information and to the fact that MEltfa\&$\#$39;s probabilistic model smoothes many errors in its training corpus (with a {\textquoteleft}{\textquoteleft}co- training\&$\#$39;\&$\#$39;-like effect). This latter hypothesis is confirmed by the fact that, among these 1,568 tokens, MEltfa\&$\#$39;s result are slightly closer to the gold standard (93,9\% accuracy on the full tagset) than to its automatically converted version before manual correction (93.4\%).\ 

}, url = {http://web.me.com/gwalther/homepage/Publications_(fr)_files/icil11pergram.pdf}, author = {Beno{\^\i}t Sagot and G{\'e}raldine Walther and Pegah Faghiri and Samvelian, Pollet} } @inproceedings {sagot11sfcm, title = {Non-canonical inflection : data, formalisation and complexity measures.}, volume = {100}, year = {2011}, note = {

Systems and Frameworks for Computational Morphology

}, month = {August 2011}, pages = {23-45}, publisher = {Springer}, chapter = {23}, abstract = {

Non-canonical inflection (suppletion, deponency, heterocli- sis...) is extensively studied in theoretical approaches to morphology. However, these studies often lack practical implementations associated with large-scale lexica. Yet these are precisely the requirements for ob- jective comparative studies on the complexity of morphological descrip- tions. We show how a model of inflectional morphology which can rep- resent many non-canonical phenomena [67], as well as a formalisation and an implementation thereof can be used to evaluate the complexity of competing morphological descriptions. After illustrating the proper- ties of the model with data about French, Latin, Italian, Persian and Sorani Kurdish verbs and about noun classes from Croatian and Slovak we expose experiments conducted on the complexity of four competing descriptions of French verbal inflection. The complexity is evaluated us- ing the information-theoretic concept of description length. We show that the new concepts introduced in the model by [67] enable reducing the complexity of morphological descriptions w.r.t. both traditional or more recent models.\ 

}, keywords = {Canonicity, Description Complexity, Inflection Pat- tern, Inflection Zone, Inflectional Morphology, MDL, Paradigm Shape, Stem Pattern., Stem Zone}, isbn = {978-3-642-23137-7}, issn = {1865-0929}, url = {http://web.me.com/gwalther/homepage/Publications_(fr)_files/sfcm11-updated.pdf}, author = {Beno{\^\i}t Sagot and G{\'e}raldine Walther} } @inproceedings {walther11lgc, title = {Probl{\`e}mes d{\textquoteright}int{\'e}gration morphologique d{\textquoteright}emprunts d{\textquoteright}origine anglaise en fran{\c c}ais}, year = {2011}, month = {10/2011}, address = {Nicosie, Chypre}, abstract = {

Nous proposons une {\'e}tude morphologique de l{\textquoteright}emprunt, notamment verbal et nominal, d{\textquoteright}origine anglaise en fran{\c c}ais. {\`A} partir de donn{\'e}es extraites d{\textquoteright}un corpus volumineux, nous {\'e}tudions les proc{\'e}d{\'e}s morphologiques d{\textquoteright}int{\'e}gration des nouvelles unit{\'e}s lexicales (sous leur forme graph{\'e}mique) et les probl{\`e}mes qu{\textquoteright}ils posent notamment en termes d{\textquoteright}instabilit{\'e} orthographique ou de m{\'e}canismes d{\'e}rivationnels. Cette {\'e}tude fournit ainsi une premi{\`e}re approche th{\'e}orique du ph{\'e}nom{\`e}ne morphologique de l{\textquoteright}emprunt. Elle devra ensuite servir de support th{\'e}orique {\`a} un traitement automatique des emprunts.

}, keywords = {Emprunt, Int{\'e}gration morphologique et lexicale, Lexique, Morphologie, N{\'e}ologie}, url = {http://web.me.com/gwalther/homepage/Publications_(fr)_files/clg11neo-final.pdf}, author = {G{\'e}raldine Walther and Beno{\^\i}t Sagot} } @inproceedings {walther10soralex, title = {Developing a Large-Scale Lexicon for a Less-Resourced Language: General Methodology and Preliminary Experiments on Sorani Kurdish}, year = {2010}, address = {Valetta, Malta}, abstract = {

In this paper, we describe a general methodology for developing a large-scale lexicon for a less-resourced language, i.e., a language for which raw internet-based corpora and general-purpose grammars are virtually the only existing resources. We apply this methodology to the development of a morphological lexicon for Sorani Kurdish, an Iranian language mostly spoken in northern Iraq and north-western Iran. Although preliminary, our results demonstrate the relevance of this methodology.\ 

}, url = {http://web.me.com/gwalther/homepage/Publications_(fr)_files/saltmil10soralex.pdf}, author = {G{\'e}raldine Walther and Beno{\^\i}t Sagot} } @inproceedings {sagot10perlextaln, title = {D{\'e}veloppement de ressources pour le persan: lexique morphologique et cha{\^{\i}ne de traitements de surface}, year = {2010}, address = {Montr{\'e}al, Canada}, abstract = {

Nous pr{\'e}sentons PerLex, un lexique morphologique du persan {\`a} large couverture et librement disponible, accompagn{\'e} d{\textquoteright}une cha{\^\i}ne de traitements de surface pour cette langue. Nous d{\'e}crivons quelques caract{\'e}ristiques de la morphologie du persan, et la fa{\c c}on dont nous l{\textquoteright}avons repr{\'e}sent{\'e}e dans le formalisme lexical Alexina, sur lequel repose PerLex. Nous insistons sur la m{\'e}thodologie que nous avons employ{\'e}e pour construire les entr{\'e}es lexicales {\`a} partir de diverses sources, ainsi que sur les probl{\`e}mes li{\'e}s {\`a} la normalisation typographique. Le lexique obtenu a une couverture satisfaisante sur un corpus de r{\'e}f{\'e}rence, et devrait donc constituer un bon point de d{\'e}part pour le d{\'e}veloppement d{\textquoteright}un lexique syntaxique du persan.

}, url = {http://web.me.com/gwalther/homepage/Publications_(fr)_files/taln2010perlex.pdf}, author = {Beno{\^\i}t Sagot and G{\'e}raldine Walther} } @inproceedings {walther10kurlex, title = {Fast Development of Basic NLP Tools: Towards a Lexicon and a POS Tagger for Kurmanji Kurdish}, year = {2010}, address = {Belgrad, Serbia}, abstract = {

The development of basic NLP resources for minority languages is still a challenge to both formal and compu- tational linguists. In this paper, we show how we were able to develop a medium-scale morphological lexicon for Kurmanji Kurdish in a few days time using only freely accessible resources. We also developed a preliminary POS tagger that shall be used as a pre-annotation tool for developing a POS-annotated corpus, based solely on raw text and on our morphological lexicon.\ 

}, keywords = {C-COM}, url = {http://web.me.com/gwalther/homepage/Publications_(fr)_files/clg10kmr.pdf}, author = {G{\'e}raldine Walther and Beno{\^\i}t Sagot and Kar{\"e}n Fort} } @booklet {1036, title = {KurLex}, year = {2010}, keywords = {Alexina, Kurmanji Kurdish, Lexical resource}, url = {http://alexina.gforge.inria.fr/}, author = {G{\'e}raldine Walther and Beno{\^\i}t Sagot} } @unpublished {sagot10perlex, title = {A morphological lexicon for the Persian language}, year = {2010}, address = {La Valette, Malte}, abstract = {

We introduce PerLex, a large-coverage and freely-available morphological lexicon for the Persian language. We describe the main features of the Persian morphology, and the way we have represented it within the Alexina formalism, on which PerLex is based. We focus on the methodology we used for constructing lexical entries from various sources, as well as the problems related to typographic normalisation. The resulting lexicon shows a satisfying coverage on a reference corpus and should therefore be a good starting point for developing a syntactic lexicon for the Persian language.\ 

}, url = {http://web.me.com/gwalther/homepage/Publications_(fr)_files/lrec10perlex-poster.pdf}, author = {Beno{\^\i}t Sagot and G{\'e}raldine Walther} } @booklet {1034, title = {PerLex}, year = {2010}, keywords = {Alexina, Lexical resource, Persian}, url = {http://alexina.gforge.inria.fr/}, author = {Beno{\^\i}t Sagot and G{\'e}raldine Walther} } @booklet {1035, title = {SoraLex}, year = {2010}, keywords = {Alexina, Lexical resource, Sorani Kurdish}, url = {http://alexina.gforge.inria.fr/}, author = {G{\'e}raldine Walther and Beno{\^\i}t Sagot} }