text = """Mr. Santa was busy today; But he didn't have that much work. "Hii, how are you doing? good work today!", he said."""
text2 = "such a long sentence that it proba-\
bly won't fit in a single line, but here's an intra-word hyphen too! These words will not be seperated: combinedWords combinedlongwords"
text3 = "Hiii! :D how are y'all doin' 😀. #DailyTweet @DailyTweeter full-stops.... and a link!!? https://imgur.com/gallery/y8u0gg8. In comparision to:these f123@gmail.com a word i made up is ran'om cuz i cant spel"
text_hi = "राम-श्याम बहुत ही सामान्य नाम हैं। शनैः शनैः दिन ढल गया। अतः यह सिद्ध होता है कि सूर्य पूर्व निकलता है।"
text.split()
text.split(". ")
import re
rx = r"\w+(?:'\w+)?|[^\w\s]"
print(re.findall(rx, text)) # notice "didn't" stays a single token
print(re.findall(rx, text2)) # but hyphenated words like "proba-bly" are split at the hyphen
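# The simple pattern above splits at every hyphen and shatters URLs. A rough
# extension (an illustrative sketch, not a complete tokenizer) keeps URLs,
# hashtags/mentions and intra-word hyphens or apostrophes together:
rx2 = r"https?://\S+|[#@]\w+|\w+(?:[-']\w+)*|[^\w\s]"
print(re.findall(rx2, text2)) # "proba-bly" and "intra-word" stay whole
print(re.findall(rx2, text3)) # the link (trailing dot included) and #DailyTweet stay in one piece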
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
print(sent_tokenize(text))
print(word_tokenize(text), "\n")
print(sent_tokenize(text2))
print(word_tokenize(text2), "\n")
print(sent_tokenize(text3))
print(word_tokenize(text3)) # "cant" is a single word here, and the link gets split apart
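# NLTK also ships a TweetTokenizer that is better suited to text3's
# social-media style; a quick aside with one reasonable configuration:
from nltk.tokenize import TweetTokenizer
tweet_tok = TweetTokenizer(reduce_len=True)
print(tweet_tok.tokenize(text3)) # #DailyTweet, @DailyTweeter and the link stay whole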
!pip install -U spacy
!python -m spacy download en_core_web_sm
# Create blank English and Hindi pipelines (rule-based tokenizer only) and add a sentencizer for sentence splitting
from spacy.lang.hi import Hindi
from spacy.lang.en import English
nlp = English()
nlp.add_pipe('sentencizer')
# "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)
my_doc2 = nlp(text2)
my_doc3 = nlp(text3)
for sent in my_doc.sents:
    print(sent.text)
for sent in my_doc2.sents:
    print(sent.text)
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)
token_list = []
for token in my_doc2:
    token_list.append(token.text)
print(token_list)
token_list = []
for token in my_doc3:
    token_list.append(token.text)
token_list
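# spaCy tokens expose boolean flags such as like_url, like_email and is_punct;
# a small illustration of the Token API on my_doc3 (no trained model needed for these):
for token in my_doc3:
    print(token.text, token.like_url, token.like_email, token.is_punct)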
nlp = Hindi()
nlp.add_pipe('sentencizer')
my_doc_hi = nlp(text_hi)
for sent in my_doc_hi.sents:
    print(sent.text)
token_list = []
for token in my_doc_hi:
    token_list.append(token.text)
print(token_list)
!pip install indic-nlp-library
from indicnlp.tokenize import indic_tokenize
indic_string = text_hi
indic_tokenize.trivial_tokenize(indic_string)
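# The Indic NLP Library can also split sentences; a brief sketch using its
# sentence_tokenize module (the 'hi' language code selects Hindi rules):
from indicnlp.tokenize import sentence_tokenize
sentence_tokenize.sentence_split(indic_string, lang='hi')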
!pip install stanza
import stanza
stanza.download('en')
stanza.download('hi')
stanza.download('ko')
stanza.download('ja')
nlpEn = stanza.Pipeline(lang='en', processors='tokenize')
doc = nlpEn(text)
doc2 = nlpEn(text2)
doc3 = nlpEn(text3)
for sentence in doc.sentences:
    print(sentence.text)
for sentence in doc2.sentences:
    print(sentence.text)
for i, sentence in enumerate(doc2.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print(*[f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens], sep='\n')
for i, sentence in enumerate(doc3.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print(*[f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens], sep='\n')
nlpHi = stanza.Pipeline(lang='hi', processors='tokenize')
doc = nlpHi(text_hi)
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print(*[f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens], sep='\n')
nlpJa = stanza.Pipeline(lang='ja', processors='tokenize')
doc = nlpJa('これは小さな文章です。これは別の小さな文です.')
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print(*[f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens], sep='\n')
nlpKo = stanza.Pipeline(lang='ko', processors='tokenize')
doc = nlpKo('이것은 문장입니다. 이것은 다른 문장입니다.')
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print(*[f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens], sep='\n') # 입니다 can be further tokenized
from random import sample
nltk.download('words')
from nltk.corpus import words
lowercaseCorpus = [x.lower() for x in words.words()]
print(len(lowercaseCorpus))
print(sample(lowercaseCorpus,10))
def maxMatch(bigword):
    tokens = []
    i = 0
    while i < len(bigword):
        maxWord = ""
        # greedily take the longest dictionary word starting at position i
        for j in range(i, len(bigword)):
            tempWord = bigword[i:j+1]
            if tempWord in lowercaseCorpus and len(tempWord) > len(maxWord):
                maxWord = tempWord
        if maxWord == "":
            maxWord = bigword[i]  # no dictionary match: emit one character to avoid an infinite loop
        i = i + len(maxWord)
        tokens.append(maxWord)
    print(tokens)
maxMatch("combinedwordhereforthealgorithm")
BERT WordPiece
!pip install tokenizers
from tokenizers import BertWordPieceTokenizer
!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
output = tokenizer.encode_batch([text, text2, text3])
print(output[2].tokens)
output = tokenizer.encode_batch(["averybigcombinedwordforthealgorithm"])
print(output[0].tokens)
What should you do if there is no tokenization module for your language, i.e., a low-resource language?
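One possible approach, sketched below with the tokenizers library already used above, is to train a subword vocabulary directly on whatever raw text you can collect; the corpus file name here is a hypothetical placeholder.
# Train a WordPiece vocabulary from scratch on raw text in the target language.
new_tokenizer = BertWordPieceTokenizer(lowercase=True)
new_tokenizer.train(files=["my_corpus.txt"], vocab_size=5000, min_frequency=2)
print(new_tokenizer.encode("a sentence in the target language").tokens)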