{twitter.com,
github.com,
youtube.com}/turicas
turicas.info
alvaro.justen@fgv.br
emap.fgv.br
github.com/NAMD
www.CursoDeArduino.com.br
Talk is cheap, show me the code!
>>> import nltk
>>> sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""
>>> tokens = nltk.word_tokenize(sentence)
>>> tokens
['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
>>> tagged = nltk.pos_tag(tokens)
>>> tagged
[('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN'), ('Arthur', 'NNP'), ('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')]
>>> entities = nltk.chunk.ne_chunk(tagged)
>>> entities
Tree('S', [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN'), Tree('PERSON', [('Arthur', 'NNP')]), ('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')])
# coding: utf-8

from pypelinin import Job, Pipeline

# A pipeline is declared as a dict mapping each job (or tuple of jobs)
# to the job(s) that consume its output:
#   Extractor -> Tokenizer -> (POS, FreqDist) -> Statistics
# The dict must be wrapped in Pipeline so the result exposes pipeline
# methods such as to_dot() (used later to render the graph with
# graphviz); the original left `pipeline` as a plain dict, which has
# no to_dot() method.
pipeline = Pipeline({
    Job('Extractor'): Job('Tokenizer'),
    Job('Tokenizer'): (Job('POS'), Job('FreqDist')),
    (Job('POS'), Job('FreqDist')): Job('Statistics'),
})
>>> with open('pipeline.dot', 'w') as dot_file:
...     dot_file.write(pipeline.to_dot())

$ dot -Tpng -opipeline.png pipeline.dot  # graphviz rules
# coding: utf-8

from pypelinin import Worker
from nltk import word_tokenize, sent_tokenize


class Tokenizer(Worker):
    """Worker that splits a document's raw text into tokens and sentences."""

    # Input keys this worker reads from the document.
    requires = ['text']

    def process(self, document):
        """Return word tokens and per-sentence token lists for document['text'].

        The result dict has two keys:
          - 'tokens': all word tokens of the full text
          - 'sentences': one token list per sentence
        """
        raw_text = document['text']
        all_tokens = word_tokenize(raw_text)
        per_sentence = [word_tokenize(sentence)
                        for sentence in sent_tokenize(raw_text)]
        return {'tokens': all_tokens, 'sentences': per_sentence}
?
{twitter.com,
github.com,
bitbucket.org,
youtube.com}/turicas
turicas.info
alvaro.justen@fgv.br
github.com/NAMD/pypln