{twitter.com,
github.com,
youtube.com}/turicas
turicas.info
alvaro.justen@fgv.br
emap.fgv.br
github.com/NAMD
www.CursoDeArduino.com.br
Talk is cheap, show me the code!
>>> import nltk
>>> sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""  #42
>>> tokens = nltk.word_tokenize(sentence)
>>> tokens
['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
>>> tagged = nltk.pos_tag(tokens)
>>> tagged
[('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
('Thursday', 'NNP'), ('morning', 'NN'), ('Arthur', 'NNP'),
('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'), ('very', 'RB'),
('good', 'JJ'), ('.', '.')]
>>> entities = nltk.chunk.ne_chunk(tagged)
>>> entities
Tree('S', [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'),
('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN'),
Tree('PERSON', [('Arthur', 'NNP')]),
('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'),
('very', 'RB'), ('good', 'JJ'), ('.', '.')])
# coding: utf-8
from pypelinin import Job, Pipeline
# Describe the job graph (job -> next job(s)) and wrap it in Pipeline:
# a plain dict has no ``to_dot()`` method, which the export step below
# relies on, and ``Pipeline`` was imported but otherwise unused.
pipeline = Pipeline({Job('Extractor'): Job('Tokenizer'),
                     Job('Tokenizer'): (Job('POS'), Job('FreqDist')),
                     (Job('POS'), Job('FreqDist')): Job('Statistics')})
>>> with open('pipeline.dot', 'w') as dot_file:
... dot_file.write(pipeline.to_dot())
$ dot -Tpng -opipeline.png pipeline.dot # graphviz rules
# coding: utf-8
from pypelinin import Worker
from nltk import word_tokenize, sent_tokenize
class Tokenizer(Worker):
    """Worker that splits a document's text into tokens.

    Consumes the document's ``text`` field and produces both a flat
    token list and a per-sentence token list.
    """

    requires = ['text']

    def process(self, document):
        """Return ``{'tokens': ..., 'sentences': ...}`` for the document.

        ``tokens`` is the word-tokenized full text; ``sentences`` is one
        token list per sentence of the same text.
        """
        text = document['text']
        return {
            'tokens': word_tokenize(text),
            'sentences': [word_tokenize(sentence)
                          for sentence in sent_tokenize(text)],
        }
?
{twitter.com,
github.com,
bitbucket.org,
youtube.com}/turicas
turicas.info
alvaro.justen@fgv.br
github.com/NAMD/pypln