Python: tudo o que você já deveria saber sobre Unicode
Álvaro Justen aka Turicas
8º encontro da Comunidade Python Brasileira
Rio de Janeiro
23/11/2012
Álvaro Justen aka Turicas
8º encontro da Comunidade Python Brasileira
Rio de Janeiro
23/11/2012
{twitter.com,
github.com,
youtube.com}/turicas
turicas.info
alvarojusten@gmail.com
emap.fgv.br
github.com/NAMD
www.CursoDeArduino.com.br
.encode
: unicode
→ str
.decode
: str
→ unicode
>>> nome = 'álvaro' >>> nome2 = u'álvaro' >>> print len(nome), type(nome) >>> print nome[0], nome[1] >>> print len(nome2), type(nome2) >>> print nome2[0], nome2[1] >>> print nome == nome2 >>> print nome.decode('utf-8') == nome2 >>> print nome.upper(), type(nome.upper()) >>> maiusculo = nome.decode('utf-8').upper() >>> print maiusculo, type(maiusculo)
>>> for numero in range(128): ... print numero, chr(numero), ord(chr(numero))
'á'
:
>>> print 'á'.decode('utf-8').encode('ascii') Traceback (most recent call last): File "<stdin>", line 1, in <module> UnicodeEncodeError: 'ascii' codec can't encode character u'\xe1' in position 0: ordinal not in range(128)
Tabela 1 | |
---|---|
Símbolo | Código |
A | 1 |
B | 2 |
C | 3 |
D | 4 |
Tabela 2 | |
---|---|
Símbolo | Código |
A | 4 |
B | 3 |
C | 2 |
D | 1 |
E | 5 |
>>> print 'python'.decode('utf-8').encode('ascii') >>> print 'abcdefghijklmnopqrstuvwxyz'.decode('cp1140') >>> print 'abcdefghijklmnopqrstuvwxyz'.decode('rot13') >>> print 'abcdefghijklmnopqrstuvwxyz'.encode('base64')
>>> print chr(226).decode('iso-8859-15') â >>> print chr(226).decode('iso-8859-7') β >>> print chr(226).decode('utf-8') Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode return codecs.utf_8_decode(input, errors, True) UnicodeDecodeError: 'utf8' codec can't decode byte 0xe2 in position 0: unexpected end of data
>>> from unicodedata import normalize >>> new_string = normalize('NFKD', u'Álvaro') >>> print new_string.encode('ascii', 'ignore') Alvaro
>>> print type(new_string), len(new_string) <type 'unicode'> 7 >>> print new_string.encode('ascii') Traceback (most recent call last): File "<stdin>", line 1, in <module> UnicodeEncodeError: 'ascii' codec can't encode character u'\u0301' in position 1: ordinal not in range(128)
'errors'
:>>> print new_string.encode('ascii', 'replace') # default: 'strict' A?lvaro >>> print new_string.encode('ascii', 'xmlcharrefreplace') A&#769;lvaro >>> print new_string.encode('ascii', 'backslashreplace') A\u0301lvaro
>>> u'Álvaro' u'\xc1lvaro' >>> u'Álvaro'.encode('utf-8').encode('base64') 'w4FsdmFybw==\n' >>> 'w4FsdmFybw=='.decode('base64') '\xc3\x81lvaro' >>> 'w4FsdmFybw=='.decode('base64').decode('utf8') u'\xc1lvaro'
>>> eh = unichr(0x0065) + \ unichr(0x0301) >>> eh2 = unichr(0x00e9) >>> print eh, eh2 >>> print eh == eh2 # WTF? >>> print eh.encode('utf8') >>> print eh2.encode('utf8') >>> # composed >>> print normalize('NFC', eh) >>> print normalize('NFC', eh2) >>> # decomposed >>> print normalize('NFKD', eh) >>> print normalize('NFKD', eh2)
>>> len('álvaro') 7 >>> len(u'álvaro') 6
>>> len('álvaro') 6
unicode
→ str
str
→ bytes
álvaro = 'pythonista' # works! \o/
Ctrl
+ Shift
+
u
+ codepoint
chardet
pode ajudar unicode
(print
usa sys.stdout.encoding
quando
recebe objeto unicode
)
# coding: ...
(PEP-0263)
codecs.open
(Python 2) >>> print unicodedata.name(u'Á') LATIN CAPITAL LETTER A WITH ACUTE >>> nome = unicodedata.name(u'Á') >>> simbolo = unicodedata.lookup(nome) >>> print simbolo, type(simbolo) Á <type 'unicode'> >>> print u'\N{LATIN CAPITAL LETTER A WITH ACUTE}' Á
encodings
e codecs
(biblioteca padrão)
u'\u03b2'
) ?
{twitter.com,
github.com,
youtube.com}/turicas
turicas.info
alvarojusten@gmail.com
turicas.info/slides/python-unicode