From f383928f9bf8ab592213c2a9ec919f41906da2a6 Mon Sep 17 00:00:00 2001 From: Sebastian Lohff Date: Fri, 23 Dec 2011 03:14:00 +0100 Subject: [PATCH] Advanced unicode handling for rfc1149 --- tunnel/rfc1149/rfc1149.py | 19 +++++++++++------- tunnel/rfc1149/uphelper.py | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/tunnel/rfc1149/rfc1149.py b/tunnel/rfc1149/rfc1149.py index 2dea1e5..7d4a491 100755 --- a/tunnel/rfc1149/rfc1149.py +++ b/tunnel/rfc1149/rfc1149.py @@ -5,7 +5,6 @@ import base64 import collections import logging import math -import re import sys import threading import time @@ -25,15 +24,19 @@ class TwittStreamHandler(tweepy.StreamListener): self.fragments = collections.defaultdict(str) def on_status( self, status ): - #print '-' * 20 - #print "incoming:", unicode(status.text), "from", dir(status) - #print "hex:", binToHexStr(status.text), len(status.text) - # reassemble messages, write them to dev when complete + """ On status, decode and reassemble packet-status-texts. If complete, write them to the tun-dev. """ + + # Twitter breaks some of the unicode characters, so we need to reassemble them. 
+ # Note that, because of this, packets containing the following bytes will get mangled: + # 00 5c 00 XX 00 XX 00 XX 00 5c 00 XX 00 XX 00 XX 00 5c 00 XX 00 XX 00 XX + # where XX is a byte value from ord('0') to ord('9') + sourcePacket = UPHelper.reassembleBrokenChars(status.text) + print "in uni:", repr(sourcePacket) (isFragment, packetLen, packetId, packet) = None, None, None, None try: - (isFragment, packetLen, packetId, packet) = UPHelper.decode(status.text) + (isFragment, packetLen, packetId, packet) = UPHelper.decode(sourcePacket) except ValueError, e: - print "Could not decode tweet, omitting (Error was: %s).\n\tText was: %s" % (e, status.text) + print "Could not decode tweet, omitting (Error was: %s).\n\tText was: %s" % (e, repr(sourcePacket)) raise return #print "Parsed packet:", (isFragment, packetLen, packetId) @@ -147,6 +150,8 @@ if not Conf['twitter']['ACCESS_KEY']: def sendToNet(self, packet): fragments = UPHelper.encode(packet) + print "out raw:", repr(packet) + print "out frag:", repr(fragments[0]) print " >> Sending out %d bytes in %d tweet%s" % (len(packet), len(fragments), len(fragments)!=1 and "s" or "") for fragment in fragments: # FIXME: catch tweepy.error.TweepError diff --git a/tunnel/rfc1149/uphelper.py b/tunnel/rfc1149/uphelper.py index a6e9b11..395d74c 100644 --- a/tunnel/rfc1149/uphelper.py +++ b/tunnel/rfc1149/uphelper.py @@ -3,6 +3,7 @@ import bitarray import random +import re class UPHelper(): """ The Unicode Packet Helper @@ -87,11 +88,50 @@ class UPHelper(): fragments[y] = ret return fragments + @staticmethod + def reassembleBrokenChars(packet): + """ Reassemble broken characters back to unicode. + + Twitter breaks some characters (currently known range is 0xd800 - 0xdfff) + into r"\XXX\XXX\XXX", X being octal digits. These are actually strings, + so one unicode char from the range gets broken up into 12 chars. + + Also _some_ of these are again converted into other chars. 
+ \ud800\udc00 gets converted to \U00010000, so we need to guess-convert + these back. """ + origPacket = packet + brokenChars = re.findall(r"(\\([0-9]{3})\\([0-9]{3})\\([0-9]{3}))", packet) + for broken in brokenChars: + #print "broken", broken, repr("".join(map(lambda x: chr(int(x, 8)), broken[1:]))) + newChar = "".join(map(lambda x: chr(int(x, 8)), broken[1:])).decode("utf-8") + packet = packet.replace(broken[0], newChar) + + # this is guesswork-derivation, it is derived from these lines + # they represent our input and Twitter's output + # guesswork++: for the header, this is plausible, afterwards not. + # u"\ud900\udc00 \uda00\udcFF \udb00\uddFF" + # u'\U00050000 \U000900ff \U000d01ff' + # u"\ud800\udc00 \ud800\udcFF \ud800\uddFF \ud800\ude00 \ud800\udeff \ud800\udf00 \ud800\udfff" + # u'\U00010000 \U000100ff \U000101ff \U00010200 \U000102ff \U00010300 \U000103ff' + # u"\ud800\udc00 \ud801\udc00 \ud802\udc00 \ud803\udc00 \ud804\udc00 \ud805\udc00\ud806\udc00 \ud807\udc00 \ud808\udc00 \ud809\udc00 \ud80a\udc00 \ud80b\udc00 \ud80c\udc00 \ud80d\udc00 \ud80e\udc00 \ud80f\udc00 \ud810\udc00 \ud811\udc00 \ud812\udc00 \ud813\udc00" + # u'\U00010000 \U00010400 \U00010800 \U00010c00 \U00011000 \U00011400\U00011800 \U00011c00 \U00012000 \U00012400 \U00012800 \U00012c00 \U00013000 \U00013400 \U00013800 \U00013c00 \U00014000 \U00014400 \U00014800 \U00014c00' + + + for c in origPacket[11:]: + o = ord(c) + if o > 65535: + # -.- + a = unichr(0xd800 + ((o >> 10) - 64)) + b = unichr(0xdc00 + (o & 1023)) + packet = packet.replace(c, a+b) + return packet + @staticmethod def decode(packet): """ Decodes an unicodestring (packet) back to header + data Returns: tupel(isFragmented, packetLen, packetId, data) """ + if len(packet) < 11: raise ValueError("This is not a valid packet, header is too short (should be at least 11, is %d)" % len(packet)) header = bitarray.bitarray()