Advanced unicode handling for rfc1149

This commit is contained in:
Sebastian Lohff 2011-12-23 03:14:00 +01:00
parent 1662510847
commit f383928f9b
2 changed files with 52 additions and 7 deletions

View File

@ -5,7 +5,6 @@ import base64
import collections import collections
import logging import logging
import math import math
import re
import sys import sys
import threading import threading
import time import time
@ -25,15 +24,19 @@ class TwittStreamHandler(tweepy.StreamListener):
self.fragments = collections.defaultdict(str) self.fragments = collections.defaultdict(str)
def on_status( self, status ): def on_status( self, status ):
#print '-' * 20 """ On status, decode and reassemble packet-status-texts. If complete, write them to the tun-dev. """
#print "incoming:", unicode(status.text), "from", dir(status)
#print "hex:", binToHexStr(status.text), len(status.text) # Twitter breaks some of the unicode characters, so we need to reassemble them.
# reassemble messages, write them to dev when complete # Note that through this packets containing the following bytes will get mangled:
# 00 5c 00 XX 00 XX 00 XX 00 5c 00 XX 00 XX 00 XX 00 5c 00 XX 00 XX 00 XX
# while XX is a number from ord('0') to ord('9')
sourcePacket = UPHelper.reassembleBrokenChars(status.text)
print "in uni:", repr(sourcePacket)
(isFragment, packetLen, packetId, packet) = None, None, None, None (isFragment, packetLen, packetId, packet) = None, None, None, None
try: try:
(isFragment, packetLen, packetId, packet) = UPHelper.decode(status.text) (isFragment, packetLen, packetId, packet) = UPHelper.decode(sourcePacket)
except ValueError, e: except ValueError, e:
print "Could not decode tweet, omitting (Error was: %s).\n\tText was: %s" % (e, status.text) print "Could not decode tweet, omitting (Error was: %s).\n\tText was: %s" % (e, repr(sourcePacket))
raise raise
return return
#print "Parsed packet:", (isFragment, packetLen, packetId) #print "Parsed packet:", (isFragment, packetLen, packetId)
@ -147,6 +150,8 @@ if not Conf['twitter']['ACCESS_KEY']:
def sendToNet(self, packet): def sendToNet(self, packet):
fragments = UPHelper.encode(packet) fragments = UPHelper.encode(packet)
print "out raw:", repr(packet)
print "out frag:", repr(fragments[0])
print " >> Sending out %d bytes in %d tweet%s" % (len(packet), len(fragments), len(fragments)!=1 and "s" or "") print " >> Sending out %d bytes in %d tweet%s" % (len(packet), len(fragments), len(fragments)!=1 and "s" or "")
for fragment in fragments: for fragment in fragments:
# FIXME: catch tweepy.error.TweepError # FIXME: catch tweepy.error.TweepError

View File

@ -3,6 +3,7 @@
import bitarray import bitarray
import random import random
import re
class UPHelper(): class UPHelper():
""" The Unicode Packet Helper """ The Unicode Packet Helper
@ -87,11 +88,50 @@ class UPHelper():
fragments[y] = ret fragments[y] = ret
return fragments return fragments
@staticmethod
def reassembleBrokenChars(packet):
    r""" Reassemble characters that Twitter mangled back into unicode.

    Twitter breaks some characters (currently known range is 0xd800 - 0xdfff)
    into the literal text "\XXX\XXX\XXX", X being octal digits. These are
    real characters in the tweet, so one unicode char from that range arrives
    as 12 chars. Each such escape is read as three UTF-8 bytes and decoded
    back into the character(s) they encode.

    Also _some_ surrogate pairs are merged by Twitter into a single char
    above 0xffff (\ud800\udc00 comes back as \U00010000), so every such char
    found behind the header is guess-converted back into its pair.

    packet:  the tweet text as received.
    Returns: the text with escapes decoded and merged pairs split again.
             Escapes that are not three valid octal bytes (\000 - \377) or
             that do not form valid UTF-8 are left untouched instead of
             raising, so the caller can still reject the packet on its own.
    """
    # decode() requires at least 11 chars of header; only the text behind
    # that offset of the *unmodified* tweet is scanned for merged pairs.
    headerLen = 11
    origPacket = packet

    def decodeEscape(match):
        # Three octal escapes -> three raw bytes -> utf-8 decoded text.
        rawBytes = bytearray(int(octal, 8) for octal in match.groups())
        try:
            return rawBytes.decode("utf-8")
        except UnicodeDecodeError:
            # Not something Twitter produced from one of our chars, keep it.
            return match.group(0)

    # First digit is limited to 0-3 so that every group fits into one byte;
    # digits 8 and 9 are no octal digits and must not match at all.
    packet = re.sub(r"\\([0-3][0-7]{2})\\([0-3][0-7]{2})\\([0-3][0-7]{2})",
                    decodeEscape, packet)

    # this is guesswork-derivation, its derived from these lines
    # they represent our input and twitters output
    # guesswork++: for the header, this is plausible, afterwards not.
    # u"\ud900\udc00 \uda00\udcFF \udb00\uddFF"
    # u"\U00050000 \U000900ff \U000d01ff"
    # u"\ud800\udc00 \ud800\udcFF \ud800\uddFF \ud800\ude00 \ud800\udeff \ud800\udf00 \ud800\udfff"
    # u"\U00010000 \U000100ff \U000101ff \U00010200 \U000102ff \U00010300 \U000103ff"
    # u"\ud800\udc00 \ud801\udc00 \ud802\udc00 \ud803\udc00 \ud804\udc00 \ud805\udc00\ud806\udc00 \ud807\udc00 \ud808\udc00 \ud809\udc00 \ud80a\udc00 \ud80b\udc00 \ud80c\udc00 \ud80d\udc00 \ud80e\udc00 \ud80f\udc00 \ud810\udc00 \ud811\udc00 \ud812\udc00 \ud813\udc00"
    # u"\U00010000 \U00010400 \U00010800 \U00010c00 \U00011000 \U00011400\U00011800 \U00011c00 \U00012000 \U00012400 \U00012800 \U00012c00 \U00013000 \U00013400 \U00013800 \U00013c00 \U00014000 \U00014400 \U00014800 \U00014c00"
    #
    # replace() already handles every occurrence of a char, so each distinct
    # char only needs to be looked at once.
    for char in set(origPacket[headerLen:]):
        codepoint = ord(char)
        if codepoint > 0xFFFF:
            # (codepoint >> 10) - 64 equals (codepoint - 0x10000) >> 10,
            # i.e. the offset of the high surrogate; the low ten bits are
            # the offset of the low surrogate.
            high = unichr(0xD800 + ((codepoint >> 10) - 64))
            low = unichr(0xDC00 + (codepoint & 0x3FF))
            packet = packet.replace(char, high + low)
    return packet
@staticmethod @staticmethod
def decode(packet): def decode(packet):
""" Decodes an unicodestring (packet) back to header + data """ Decodes an unicodestring (packet) back to header + data
Returns: tupel(isFragmented, packetLen, packetId, data) """ Returns: tupel(isFragmented, packetLen, packetId, data) """
if len(packet) < 11: if len(packet) < 11:
raise ValueError("This is not a valid packet, header is too short (should be at least 11, is %d)" % len(packet)) raise ValueError("This is not a valid packet, header is too short (should be at least 11, is %d)" % len(packet))
header = bitarray.bitarray() header = bitarray.bitarray()