9 changes: 8 additions & 1 deletion README.rst
@@ -1,13 +1,18 @@
===========
autocorrect
===========

Python 3 Spelling Corrector
Supports English and Bengali


Installation
============
.. code-block:: bash

    pip install autocorrect
    pip install git+https://github.com/SubrataSarkar32/autocorrect.git

Examples
========
@@ -16,6 +21,8 @@ Examples
>>> from autocorrect import spell
>>> spell('HTe')
'The'
>>> # for Bengali
>>> spell('কখন', language='bn')

Contribute
==========
16 changes: 16 additions & 0 deletions autocorrect.egg-info/PKG-INFO
@@ -0,0 +1,16 @@
Metadata-Version: 2.1
Name: autocorrect
Version: 0.3.0
Summary: Python 3 Spelling Corrector
Home-page: https://github.com/phatpiglet/autocorrect/
Author: Jonas McCallum
Author-email: jonasmccallum@gmail.com
License: http://www.opensource.org/licenses/mit-license.php
Keywords: autocorrect spelling corrector
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Natural Language :: English
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
License-File: LICENSE
15 changes: 15 additions & 0 deletions autocorrect.egg-info/SOURCES.txt
@@ -0,0 +1,15 @@
LICENSE
README.rst
setup.py
autocorrect/__init__.py
autocorrect/nlp_parser.py
autocorrect/nlp_parser_bn.py
autocorrect/utils.py
autocorrect/word.py
autocorrect/word_lists.py
autocorrect/word_lists_bn.py
autocorrect/words.bz2
autocorrect.egg-info/PKG-INFO
autocorrect.egg-info/SOURCES.txt
autocorrect.egg-info/dependency_links.txt
autocorrect.egg-info/top_level.txt
1 change: 1 addition & 0 deletions autocorrect.egg-info/dependency_links.txt
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions autocorrect.egg-info/top_level.txt
@@ -0,0 +1 @@
autocorrect
31 changes: 22 additions & 9 deletions autocorrect/__init__.py
@@ -8,19 +8,32 @@
# http://www.opensource.org/licenses/mit-license.php
"""
Spell function

Modified by Subrata Sarkar
https://github.com/SubrataSarkar32
Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.nlp_parser import NLP_COUNTS
from autocorrect.nlp_parser_bn import NLP_COUNTS as NLP_COUNTS_BN
from autocorrect.word import Word, common, exact, known, get_case

def spell(word):
    """most likely correction for everything up to a double typo"""
    w = Word(word)
    candidates = (common([word]) or exact([word]) or known([word]) or
                  known(w.typos()) or common(w.double_typos()) or
                  [word])
    correction = max(candidates, key=NLP_COUNTS.get)
    return get_case(word, correction)
def spell(word, language='en'):
    """Most likely correction for everything up to a double typo.

    The ``language`` parameter selects the word-frequency model:
    'en' (English, the default) or 'bn' (Bengali)."""
    if language == 'en':
        counts = NLP_COUNTS
    elif language == 'bn':
        counts = NLP_COUNTS_BN
    else:
        raise ValueError("This language is not supported")
    w = Word(word)
    candidates = (common([word]) or exact([word]) or known([word]) or
                  known(w.typos()) or common(w.double_typos()) or
                  [word])
    correction = max(candidates, key=counts.get)
    return get_case(word, correction)
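
For reference, the extended signature is used like this (a short sketch; the Bengali correction returned depends on the shipped frequency data, so no output is asserted for it here):

from autocorrect import spell

print(spell('HTe'))                  # 'The' -- default English model
print(spell('কখন', language='bn'))   # ranked by the Bengali counts
try:
    spell('HTe', language='fr')      # any other code raises ValueError
except ValueError as err:
    print(err)                       # This language is not supported
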
Binary file added autocorrect/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added autocorrect/__pycache__/nlp_parser.cpython-36.pyc
Binary file not shown.
Binary file added autocorrect/__pycache__/utils.cpython-36.pyc
Binary file not shown.
Binary file added autocorrect/__pycache__/word.cpython-36.pyc
Binary file not shown.
Binary file added autocorrect/__pycache__/word_lists.cpython-36.pyc
Binary file not shown.
12 changes: 11 additions & 1 deletion autocorrect/nlp_parser.py
@@ -13,7 +13,7 @@
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import words_from_archive, zero_default_dict
from autocorrect.utils import (words_from_archive, zero_default_dict,
                               words_from_file, words_from_archive1)

def parse(lang_sample):
    """tally word popularity using novel extracts, etc"""
@@ -23,4 +23,14 @@ def parse(lang_sample):
        counts[word] += 1
    return set(words), counts

def parse1(lang_sample):
    """tally word popularity from a UTF-8 text in the archive"""
    words = words_from_archive1(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

NLP_WORDS, NLP_COUNTS = parse('big.txt')
#NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt')
#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1
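
For intuition, the tally in parse() and parse1() reduces to the following (a self-contained sketch; collections.defaultdict(int) stands in for the package's zero_default_dict, whose internals are not shown in this diff):

from collections import defaultdict

def tally(words):
    # count each token; unseen tokens read back as 0
    counts = defaultdict(int)
    for word in words:
        counts[word] += 1
    return set(words), counts

vocab, counts = tally(['the', 'cat', 'sat', 'the'])
assert counts['the'] == 2 and counts['dog'] == 0
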
36 changes: 36 additions & 0 deletions autocorrect/nlp_parser_bn.py
@@ -0,0 +1,36 @@
# Python 3 Spelling Corrector
#
# Copyright 2014 Jonas McCallum.
# Updated for Python 3, based on Peter Norvig's
# 2007 version: http://norvig.com/spell-correct.html
#
# Open source, MIT license
# http://www.opensource.org/licenses/mit-license.php
"""
NLP parser

Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import (words_from_archive, zero_default_dict,
                               words_from_file, words_from_archive1)

def parse(lang_sample):
    """tally word popularity using novel extracts, etc"""
    words = words_from_archive(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

def parse1(lang_sample):
    """tally word popularity from a UTF-8 text in the archive"""
    words = words_from_archive1(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

#NLP_WORDS_1, NLP_COUNTS_1 = parse('big.txt')
NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt')
#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1
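
The constants built here are what spell() ranks candidates against via max(candidates, key=NLP_COUNTS_BN.get). A toy illustration of that ranking (the counts below are invented, not taken from bengnovasssh.txt):

counts = {'কখন': 12, 'কখনো': 3}
candidates = ['কখনো', 'কখন']
best = max(candidates, key=lambda w: counts.get(w, 0))
assert best == 'কখন'  # the more frequent form wins
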
32 changes: 30 additions & 2 deletions autocorrect/utils.py
@@ -13,14 +13,42 @@
https://github.com/foobarmus/autocorrect

"""
import re, os, tarfile
import re, os, tarfile, io
from contextlib import closing
from itertools import chain

PATH = os.path.abspath(os.path.dirname(__file__))
BZ2 = 'words.bz2'
RE = '[A-Za-z]+'
RE1 = r'\w+'  # Unicode word characters; covers Bengali letters

def words_from_file(filename, include_dups=False, map_case=False):
    """extract words from a plain UTF-8 text file"""
    filepath = os.path.join(PATH, filename)
    with io.open(filepath, 'r', encoding='utf8') as f:
        words = f.read().split()
    if include_dups:
        return words
    elif map_case:
        return {w.lower(): w for w in words}
    else:
        return set(words)

def words_from_archive1(filename, include_dups=False, map_case=False):
    """extract words from a UTF-8 text file in the archive"""
    bz2 = os.path.join(PATH, BZ2)
    tar_path = '{}/{}'.format('words', filename)
    with closing(tarfile.open(bz2, 'r:bz2')) as t:
        with closing(t.extractfile(tar_path)) as f:
            words = re.findall(RE1, f.read().decode('utf-8'))
    if include_dups:
        return words
    elif map_case:
        return {w.lower(): w for w in words}
    else:
        return set(words)


def words_from_archive(filename, include_dups=False, map_case=False):
"""extract words from a text file in the archive"""
bz2 = os.path.join(PATH, BZ2)
@@ -44,7 +72,7 @@ def concat(*args):

class Zero(dict):
    """dict with a zero default"""

    def __getitem__(self, key):
        return self.get(key)

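
The reason for the separate Unicode-aware pattern: the original ASCII class drops Bengali text entirely, while \w is Unicode-aware by default in Python 3. A quick check:

import re

text = 'hello কখন world'
print(re.findall('[A-Za-z]+', text))  # ['hello', 'world'] -- Bengali lost
print(re.findall(r'\w+', text))       # ['hello', 'কখন', 'world']
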
2 changes: 1 addition & 1 deletion autocorrect/word.py
@@ -18,7 +18,7 @@
from autocorrect.word_lists import LOWERCASE, MIXED_CASE
from autocorrect.word_lists import LOWERED, CASE_MAPPED

ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
ALPHABET = '''abcdefghijklmnopqrstuvwxyzঅআইঈউঊঋঌএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভময়সশষহযরড়ঢ়লৎঁংঃ্৷ািীুূৃেৈোৌ'''
KNOWN_WORDS = LOWERCASE | LOWERED | NLP_WORDS

class Word(object):
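
Extending ALPHABET matters because typo candidates are enumerated by single edits over it, so every added character widens the replace/insert space. A Norvig-style sketch of that enumeration (edits1 here is illustrative; Word.typos() is the package's actual implementation):

ALPHABET = 'abcdefghijklmnopqrstuvwxyz' + 'কখন'  # truncated for the sketch

def edits1(word):
    # every string one edit away: deletes, transposes, replaces, inserts
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits if b for c in ALPHABET]
    inserts = [a + c + b for a, b in splits for c in ALPHABET]
    return set(deletes + transposes + replaces + inserts)
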
9 changes: 5 additions & 4 deletions autocorrect/word_lists.py
@@ -13,7 +13,7 @@
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import words_from_archive
from autocorrect.utils import (words_from_archive, words_from_file,
                               words_from_archive1)

# en_US_GB_CA is a superset of US, GB and CA
# spellings (color, colour, etc). It contains
@@ -29,10 +29,11 @@
# Colombo (mixed)

LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt')
#LOWERCASE = words_from_archive1('bdict4.txt')
# {'we', 'flew', 'to', 'via'}

CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt',
                                 map_case=True)
# To add another language, supply its word list in lowercase
# (for Devanagari-type scripts, the lowercase list alone suffices)
CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', map_case=True)
CASE_MAPPED = {}
# {abu': 'Abu',
# 'dhabi': 'Dhabi',
# 'colombo': 'Colombo'}
52 changes: 52 additions & 0 deletions autocorrect/word_lists_bn.py
@@ -0,0 +1,52 @@
# Python 3 Spelling Corrector
#
# Copyright 2014 Jonas McCallum.
# Updated for Python 3, based on Peter Norvig's
# 2007 version: http://norvig.com/spell-correct.html
#
# Open source, MIT license
# http://www.opensource.org/licenses/mit-license.php
"""
Word lists for case sensitive/insensitive lookups

Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import (words_from_archive, words_from_file,
                               words_from_archive1)

# en_US_GB_CA is a superset of US, GB and CA
# spellings (color, colour, etc). It contains
# roughly half a million words. For this
# example, imagine it's just seven words...
#
# we (lower)
# flew (lower)
# to (lower)
# Abu (mixed)
# Dhabi (mixed)
# via (lower)
# Colombo (mixed)

#LOWERCASE = words_from_archive('en_US_GB_CA_lower.txt')
LOWERCASE = words_from_archive1('bdict4.txt')
# {'we', 'flew', 'to', 'via'}
# To add another language, supply its word list in lowercase
# (for Devanagari-type scripts, the lowercase list alone suffices)
#CASE_MAPPED = words_from_archive('en_US_GB_CA_mixed.txt', map_case=True)
CASE_MAPPED = {}
# {abu': 'Abu',
# 'dhabi': 'Dhabi',
# 'colombo': 'Colombo'}
#
# Note that en_US_GB_CA_mixed.txt also contains
# acronyms/mixed case variants of common words,
# so in reality, CASE_MAPPED also contains:
#
# {'to': 'TO',
# 'via': 'Via'}

MIXED_CASE = set(CASE_MAPPED.values())
# {'Abu', 'Dhabi', 'Colombo'}

LOWERED = set(CASE_MAPPED.keys())
# {'abu', 'dhabi', 'colombo'}
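
Following this pattern, wiring in a further caseless-script language needs only a word list plus empty case maps (a sketch; 'hdict.txt' is a hypothetical archive entry, not shipped in words.bz2):

from autocorrect.utils import words_from_archive1

LOWERCASE = words_from_archive1('hdict.txt')  # hypothetical word list
CASE_MAPPED = {}  # Devanagari-type scripts have no letter case
MIXED_CASE = set(CASE_MAPPED.values())
LOWERED = set(CASE_MAPPED.keys())
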
Binary file modified autocorrect/words.bz2
Binary file not shown.
39 changes: 39 additions & 0 deletions build/lib/autocorrect/__init__.py
@@ -0,0 +1,39 @@
# Python 3 Spelling Corrector
#
# Copyright 2014 Jonas McCallum.
# Updated for Python 3, based on Peter Norvig's
# 2007 version: http://norvig.com/spell-correct.html
#
# Open source, MIT license
# http://www.opensource.org/licenses/mit-license.php
"""
Spell function
Modified by Subrata Sarkar
https://github.com/SubrataSarkar32
Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.nlp_parser import NLP_COUNTS
from autocorrect.nlp_parser_bn import NLP_COUNTS as NLP_COUNTS_BN
from autocorrect.word import Word, common, exact, known, get_case

def spell(word, language='en'):
    """Most likely correction for everything up to a double typo.

    The ``language`` parameter selects the word-frequency model:
    'en' (English, the default) or 'bn' (Bengali)."""
    if language == 'en':
        counts = NLP_COUNTS
    elif language == 'bn':
        counts = NLP_COUNTS_BN
    else:
        raise ValueError("This language is not supported")
    w = Word(word)
    candidates = (common([word]) or exact([word]) or known([word]) or
                  known(w.typos()) or common(w.double_typos()) or
                  [word])
    correction = max(candidates, key=counts.get)
    return get_case(word, correction)
36 changes: 36 additions & 0 deletions build/lib/autocorrect/nlp_parser.py
@@ -0,0 +1,36 @@
# Python 3 Spelling Corrector
#
# Copyright 2014 Jonas McCallum.
# Updated for Python 3, based on Peter Norvig's
# 2007 version: http://norvig.com/spell-correct.html
#
# Open source, MIT license
# http://www.opensource.org/licenses/mit-license.php
"""
NLP parser

Author: Jonas McCallum
https://github.com/foobarmus/autocorrect

"""
from autocorrect.utils import (words_from_archive, zero_default_dict,
                               words_from_file, words_from_archive1)

def parse(lang_sample):
    """tally word popularity using novel extracts, etc"""
    words = words_from_archive(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

def parse1(lang_sample):
    """tally word popularity from a UTF-8 text in the archive"""
    words = words_from_archive1(lang_sample, include_dups=True)
    counts = zero_default_dict()
    for word in words:
        counts[word] += 1
    return set(words), counts

NLP_WORDS, NLP_COUNTS = parse('big.txt')
#NLP_WORDS, NLP_COUNTS = parse1('bengnovasssh.txt')
#NLP_WORDS, NLP_COUNTS = set(list(NLP_WORDS_1) + list(NLP_WORDS_2)), NLP_COUNTS_1