"""Demonstrate NLTK tokenization: sentence, word, and word-punct tokenizers."""
import nltk.tokenize as tk

doc = "Are you curious about tokenization? " \
      "Let's see how it works! " \
      "We need to analyze a couple of sentences " \
      "with punctuations to see it in action."
print(doc)


def _print_numbered(tokens):
    """Print each token prefixed with its 1-based index, right-aligned to 2 chars."""
    # Fix: original read `for i, token inenumerate(tokens)` — missing space
    # between `in` and `enumerate` was a SyntaxError.
    for i, token in enumerate(tokens):
        print("%2d" % (i + 1), token)


# Split the document into sentences.
tokens = tk.sent_tokenize(doc)
_print_numbered(tokens)
print('-' * 15)

# Split the document into words (Treebank-style word tokenizer).
tokens = tk.word_tokenize(doc)
_print_numbered(tokens)
print('-' * 15)

# WordPunctTokenizer additionally splits punctuation into separate tokens.
tokenizer = tk.WordPunctTokenizer()
tokens = tokenizer.tokenize(doc)
_print_numbered(tokens)
"""Build a bag-of-words (BOW) matrix from the sentences of a short document."""
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft

doc = 'This hotel is very bad. The toilet in this hotel smells bad. The environment of this hotel is very good.'
print(doc)

# One BOW row per sentence, so first split the document into sentences.
sentences = tk.sent_tokenize(doc)
print(sentences)

# CountVectorizer learns the vocabulary and counts word occurrences per sentence.
cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences).toarray()
print(bow)

# Fix: get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() is the supported replacement (returns the
# vocabulary in column order of `bow`).
words = cv.get_feature_names_out()
print(words)
"""Build a bag-of-words matrix and convert it to term frequencies (TF)."""
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft
import sklearn.preprocessing as sp

doc = 'This hotel is very bad. The toilet in this hotel smells bad. The environment of this hotel is very good.'
print(doc)

# One BOW row per sentence, so first split the document into sentences.
sentences = tk.sent_tokenize(doc)
print(sentences)

# CountVectorizer learns the vocabulary and counts word occurrences per sentence.
cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences).toarray()
print(bow)

# Fix: get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() is the supported replacement (returns the
# vocabulary in column order of `bow`).
words = cv.get_feature_names_out()
print(words)

# L1-normalize each row so counts become term frequencies (each row sums to 1).
tf = sp.normalize(bow, norm='l1')
print(tf)