Skip to content

Commit

Permalink
feat: added Ukrainian vocab (#1700)
Browse files Browse the repository at this point in the history
  • Loading branch information
holyCowMp3 authored Aug 21, 2024
1 parent d7f4533 commit 06bce51
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 0 deletions.
6 changes: 6 additions & 0 deletions docs/source/modules/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ of vocabs.
* - arabic_letters
- 37
- ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي
* - generic_cyrillic_letters
- 58
- абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ
* - persian_letters
- 5
- پچڢڤگ
Expand Down Expand Up @@ -151,6 +154,9 @@ of vocabs.
* - swedish
- 106
- 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿åäöÅÄÖ
* - ukrainian
- 114
- абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~£€¥¢฿ґіїєҐІЇЄ₴
* - vietnamese
- 236
- 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ
Expand Down
2 changes: 2 additions & 0 deletions doctr/datasets/vocabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"hindi_punctuation": "।,?!:्ॐ॰॥॰",
"bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ",
"bangla_digits": "০১২৩৪৫৬৭৮৯",
"generic_cyrillic_letters": "абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ",
}

VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"]
Expand Down Expand Up @@ -59,6 +60,7 @@
VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"]
VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"]
VOCABS["ukrainian"] = VOCABS["generic_cyrillic_letters"] + VOCABS["digits"] + VOCABS["punctuation"] + VOCABS["currency"] + "ґіїєҐІЇЄ₴"
VOCABS["multilingual"] = "".join(
dict.fromkeys(
VOCABS["french"]
Expand Down

0 comments on commit 06bce51

Please sign in to comment.