9 changes: 8 additions & 1 deletion README.rst
@@ -1,13 +1,18 @@
===========
autocorrect
===========

Python 3 Spelling Corrector
Supports English and Bengali


Installation
============
.. code-block:: bash

    pip install autocorrect
    pip install git+https://github.com/SubrataSarkar32/autocorrect.git

Examples
========
@@ -16,6 +21,8 @@ Examples
>>> from autocorrect import spell
>>> spell('HTe')
'The'
>>> # for Bengali
>>> spell('কখন', language='bn')

Contribute
==========
16 changes: 16 additions & 0 deletions autocorrect.egg-info/PKG-INFO
@@ -0,0 +1,16 @@
Metadata-Version: 2.1
Name: autocorrect
Version: 0.3.0
Summary: Python 3 Spelling Corrector
Home-page: https://github.com/phatpiglet/autocorrect/
Author: Jonas McCallum
Author-email: jonasmccallum@gmail.com
License: http://www.opensource.org/licenses/mit-license.php
Keywords: autocorrect spelling corrector
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Natural Language :: English
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
License-File: LICENSE
15 changes: 15 additions & 0 deletions autocorrect.egg-info/SOURCES.txt
@@ -0,0 +1,15 @@
LICENSE
README.rst
setup.py
autocorrect/__init__.py
autocorrect/nlp_parser.py
autocorrect/nlp_parser_bn.py
autocorrect/utils.py
autocorrect/word.py
autocorrect/word_lists.py
autocorrect/word_lists_bn.py
autocorrect/words.bz2
autocorrect.egg-info/PKG-INFO
autocorrect.egg-info/SOURCES.txt
autocorrect.egg-info/dependency_links.txt
autocorrect.egg-info/top_level.txt
1 change: 1 addition & 0 deletions autocorrect.egg-info/dependency_links.txt
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions autocorrect.egg-info/top_level.txt
@@ -0,0 +1 @@
autocorrect
31 changes: 22 additions & 9 deletions autocorrect/__init__.py
@@ -8,19 +8,32 @@
# http://www.opensource.org/licenses/mit-license.php
"""
Spell function

Modified by Subrata Sarkar
https://github.com/SubrataSarkar32
Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.nlp_parser import NLP_COUNTS
from autocorrect.nlp_parser_bn import NLP_COUNTS as NLP_COUNTS_BN
from autocorrect.word import Word, common, exact, known, get_case

def spell(word):
    """most likely correction for everything up to a double typo"""
    w = Word(word)
    candidates = (common([word]) or exact([word]) or known([word]) or
                  known(w.typos()) or common(w.double_typos()) or
                  [word])
    correction = max(candidates, key=NLP_COUNTS.get)
    return get_case(word, correction)
def spell(word, language='en'):
    """Most likely correction for everything up to a double typo.

    The ``language`` parameter selects the word-frequency model:
    'en' (English, the default) or 'bn' (Bengali)."""
    if language == 'en':
        counts = NLP_COUNTS
    elif language == 'bn':
        counts = NLP_COUNTS_BN
    else:
        raise ValueError("This language is not supported")
    w = Word(word)
    candidates = (common([word]) or exact([word]) or known([word]) or
                  known(w.typos()) or common(w.double_typos()) or
                  [word])
    correction = max(candidates, key=counts.get)
    return get_case(word, correction)
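
For reference, the extended signature is used like this (a short sketch; the Bengali correction returned depends on the shipped frequency data, so no output is asserted for it here):

from autocorrect import spell

print(spell('HTe'))                  # 'The' -- default English model
print(spell('কখন', language='bn'))   # ranked by the Bengali counts
try:
    spell('HTe', language='fr')      # any other code raises ValueError
except ValueError as err:
    print(err)                       # This language is not supported
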
Binary file added autocorrect/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added autocorrect/__pycache__/nlp_parser.cpython-36.pyc
Binary file not shown.
Binary file added autocorrect/__pycache__/utils.cpython-36.pyc
Binary file not shown.
Binary file added autocorrect/__pycache__/word.cpython-36.pyc
Binary file not shown.
Binary file added autocorrect/__pycache__/word_lists.cpython-36.pyc
Binary file not shown.
12 changes: 11 additions & 1 deletion autocorrect/nlp_parser.py
@@ -13,7 +13,7 @@
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import words_from_archive, zero_default_dict
from autocorrect.utils import (words_from_archive, zero_default_dict,
                               words_from_file, words_from_archive1)

def parse(lang_sample):
    """tally word popularity using novel extracts, etc"""
@@ -23,4 +23,14 @@ def parse(lang_sample):
        counts[word] += 1
    return set(words), counts

def parse1(lang_sample):
    """tally word popularity from a UTF-8 text in the archive"""
    words = words_from_archive1(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

NLP_WORDS, NLP_COUNTS = parse('big.txt')
#NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt')
#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1
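
For intuition, the tally in parse() and parse1() reduces to the following (a self-contained sketch; collections.defaultdict(int) stands in for the package's zero_default_dict, whose internals are not shown in this diff):

from collections import defaultdict

def tally(words):
    # count each token; unseen tokens read back as 0
    counts = defaultdict(int)
    for word in words:
        counts[word] += 1
    return set(words), counts

vocab, counts = tally(['the', 'cat', 'sat', 'the'])
assert counts['the'] == 2 and counts['dog'] == 0
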
36 changes: 36 additions & 0 deletions autocorrect/nlp_parser_bn.py
@@ -0,0 +1,36 @@
# Python 3 Spelling Corrector
#
# Copyright 2014 Jonas McCallum.
# Updated for Python 3, based on Peter Norvig's
# 2007 version: http://norvig.com/spell-correct.html
#
# Open source, MIT license
# http://www.opensource.org/licenses/mit-license.php
"""
NLP parser

Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import (words_from_archive, zero_default_dict,
                               words_from_file, words_from_archive1)

def parse(lang_sample):
    """tally word popularity using novel extracts, etc"""
    words = words_from_archive(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

def parse1(lang_sample):
    """tally word popularity from a UTF-8 text in the archive"""
    words = words_from_archive1(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

#NLP_WORDS_1, NLP_COUNTS_1 = parse('big.txt')
NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt')
#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1
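
The constants built here are what spell() ranks candidates against via max(candidates, key=NLP_COUNTS_BN.get). A toy illustration of that ranking (the counts below are invented, not taken from bengnovasssh.txt):

counts = {'কখন': 12, 'কখনো': 3}
candidates = ['কখনো', 'কখন']
best = max(candidates, key=lambda w: counts.get(w, 0))
assert best == 'কখন'  # the more frequent form wins
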
32 changes: 30 additions & 2 deletions autocorrect/utils.py
@@ -13,14 +13,42 @@
https://github.com/foobarmus/autocorrect

"""
import re, os, tarfile
import re, os, tarfile, io
from contextlib import closing
from itertools import chain

PATH = os.path.abspath(os.path.dirname(__file__))
BZ2 = 'words.bz2'
RE = '[A-Za-z]+'
RE1 = r'\w+'  # Unicode word characters; covers Bengali letters

def words_from_file(filename, include_dups=False, map_case=False):
    """extract words from a plain UTF-8 text file"""
    filepath = os.path.join(PATH, filename)
    with io.open(filepath, 'r', encoding='utf8') as f:
        words = f.read().split()
    if include_dups:
        return words
    elif map_case:
        return {w.lower(): w for w in words}
    else:
        return set(words)

def words_from_archive1(filename, include_dups=False, map_case=False):
    """extract words from a UTF-8 text file in the archive"""
    bz2 = os.path.join(PATH, BZ2)
    tar_path = '{}/{}'.format('words', filename)
    with closing(tarfile.open(bz2, 'r:bz2')) as t:
        with closing(t.extractfile(tar_path)) as f:
            words = re.findall(RE1, f.read().decode('utf-8'))
    if include_dups:
        return words
    elif map_case:
        return {w.lower(): w for w in words}
    else:
        return set(words)


def words_from_archive(filename, include_dups=False, map_case=False):
"""extract words from a text file in the archive"""
bz2 = os.path.join(PATH, BZ2)
@@ -44,7 +72,7 @@ def concat(*args):

class Zero(dict):
    """dict with a zero default"""

    def __getitem__(self, key):
        return self.get(key)

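
The reason for the separate Unicode-aware pattern: the original ASCII class drops Bengali text entirely, while \w is Unicode-aware by default in Python 3. A quick check:

import re

text = 'hello কখন world'
print(re.findall('[A-Za-z]+', text))  # ['hello', 'world'] -- Bengali lost
print(re.findall(r'\w+', text))       # ['hello', 'কখন', 'world']
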
2 changes: 1 addition & 1 deletion autocorrect/word.py
@@ -18,7 +18,7 @@
from autocorrect.word_lists import LOWERCASE, MIXED_CASE
from autocorrect.word_lists import LOWERED, CASE_MAPPED

ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
ALPHABET = '''abcdefghijklmnopqrstuvwxyzঅআইঈউঊঋঌএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভময়সশষহযরড়ঢ়লৎঁংঃ্৷ািীুূৃেৈোৌ'''
KNOWN_WORDS = LOWERCASE | LOWERED | NLP_WORDS

class Word(object):
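
Extending ALPHABET matters because typo candidates are enumerated by single edits over it, so every added character widens the replace/insert space. A Norvig-style sketch of that enumeration (edits1 here is illustrative; Word.typos() is the package's actual implementation):

ALPHABET = 'abcdefghijklmnopqrstuvwxyz' + 'কখন'  # truncated for the sketch

def edits1(word):
    # every string one edit away: deletes, transposes, replaces, inserts
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits if b for c in ALPHABET]
    inserts = [a + c + b for a, b in splits for c in ALPHABET]
    return set(deletes + transposes + replaces + inserts)
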
9 changes: 5 additions & 4 deletions autocorrect/word_lists.py
@@ -13,7 +13,7 @@
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import words_from_archive
from autocorrect.utils import (words_from_archive, words_from_file,
                               words_from_archive1)

# en_US_GB_CA is a superset of US, GB and CA
# spellings (color, colour, etc). It contains
@@ -29,10 +29,11 @@
# Colombo (mixed)

LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt')
#LOWERCASE = words_from_archive1('bdict4.txt')
# {'we', 'flew', 'to', 'via'}

CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt',
                                 map_case=True)
# To add another language, supply its word list in lowercase
# (for Devanagari-type scripts, the lowercase list alone suffices)
CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', map_case=True)
CASE_MAPPED = {}
# {abu': 'Abu',
# 'dhabi': 'Dhabi',
# 'colombo': 'Colombo'}
52 changes: 52 additions & 0 deletions autocorrect/word_lists_bn.py
@@ -0,0 +1,52 @@
# Python 3 Spelling Corrector
#
# Copyright 2014 Jonas McCallum.
# Updated for Python 3, based on Peter Norvig's
# 2007 version: http://norvig.com/spell-correct.html
#
# Open source, MIT license
# http://www.opensource.org/licenses/mit-license.php
"""
Word lists for case sensitive/insensitive lookups

Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import (words_from_archive, words_from_file,
                               words_from_archive1)

# en_US_GB_CA is a superset of US, GB and CA
# spellings (color, colour, etc). It contains
# roughly half a million words. For this
# example, imagine it's just seven words...
#
# we (lower)
# flew (lower)
# to (lower)
# Abu (mixed)
# Dhabi (mixed)
# via (lower)
# Colombo (mixed)

#LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt')
LOWERCASE = words_from_archive1('bdict4.txt')
# {'we', 'flew', 'to', 'via'}
# To add another language, supply its word list in lowercase
# (for Devanagari-type scripts, the lowercase list alone suffices)
#CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', map_case=True)
CASE_MAPPED = {}
# {abu': 'Abu',
# 'dhabi': 'Dhabi',
# 'colombo': 'Colombo'}
#
# Note that en_US_GB_CA_mixed.txt also contains
# acronyms/mixed case variants of common words,
# so in reality, CASE_MAPPED also contains:
#
# {'to': 'TO',
# 'via': 'Via'}

MIXED_CASE = set(CASE_MAPPED.values())
# {'Abu', 'Dhabi', 'Colombo'}

LOWERED = set(CASE_MAPPED.keys())
# {'abu', 'dhabi', 'colombo'}
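
Following this pattern, wiring in a further caseless-script language needs only a word list plus empty case maps (a sketch; 'hdict.txt' is a hypothetical archive entry, not shipped in words.bz2):

from autocorrect.utils import words_from_archive1

LOWERCASE = words_from_archive1('hdict.txt')  # hypothetical word list
CASE_MAPPED = {}  # Devanagari-type scripts have no letter case
MIXED_CASE = set(CASE_MAPPED.values())
LOWERED = set(CASE_MAPPED.keys())
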
Binary file modified autocorrect/words.bz2
Binary file not shown.
39 changes: 39 additions & 0 deletions build/lib/autocorrect/__init__.py
@@ -0,0 +1,39 @@
# Python 3 Spelling Corrector
#
# Copyright 2014 Jonas McCallum.
# Updated for Python 3, based on Peter Norvig's
# 2007 version: http://norvig.com/spell-correct.html
#
# Open source, MIT license
# http://www.opensource.org/licenses/mit-license.php
"""
Spell function
Modified by Subrata Sarkar
https://github.com/SubrataSarkar32
Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.nlp_parser import NLP_COUNTS
from autocorrect.nlp_parser_bn import NLP_COUNTS as NLP_COUNTS_BN
from autocorrect.word import Word, common, exact, known, get_case

def spell(word, language='en'):
    """Most likely correction for everything up to a double typo.

    The ``language`` parameter selects the word-frequency model:
    'en' (English, the default) or 'bn' (Bengali)."""
    if language == 'en':
        counts = NLP_COUNTS
    elif language == 'bn':
        counts = NLP_COUNTS_BN
    else:
        raise ValueError("This language is not supported")
    w = Word(word)
    candidates = (common([word]) or exact([word]) or known([word]) or
                  known(w.typos()) or common(w.double_typos()) or
                  [word])
    correction = max(candidates, key=counts.get)
    return get_case(word, correction)
36 changes: 36 additions & 0 deletions build/lib/autocorrect/nlp_parser.py
@@ -0,0 +1,36 @@
# Python 3 Spelling Corrector
#
# Copyright 2014 Jonas McCallum.
# Updated for Python 3, based on Peter Norvig's
# 2007 version: http://norvig.com/spell-correct.html
#
# Open source, MIT license
# http://www.opensource.org/licenses/mit-license.php
"""
NLP parser

Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import (words_from_archive, zero_default_dict,
                               words_from_file, words_from_archive1)

def parse(lang_sample):
    """tally word popularity using novel extracts, etc"""
    words = words_from_archive(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

def parse1(lang_sample):
    """tally word popularity from a UTF-8 text in the archive"""
    words = words_from_archive1(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

NLP_WORDS, NLP_COUNTS = parse('big.txt')
#NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt')
#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1