Python: mmap and array

我需要在Python程序中存取一个很大的数组,数组的每一项是(int, int, float, int)的记录。如果直接用list来存放,占据的内存巨大(因为不仅所有这些数都是对象,且tuple本身也是对象)。Python提供了一个array模块,以更有效地存取数字值,但是它只支持单一的数据类型,例如你无法创建这样的array对象:a = array.array('2lfl')。

我想到了存放在文件中,并用mmap的方式来访问。除了mmap,我不知道Python中是否还有其他方法可以得到一块raw的内存。且mmap在性能和效率上,有一定的优越性。最后,辗转得到了下面的代码:

class MMArray:
    """Growable array of fixed-size binary records stored in an mmap'ed file.

    Every element is an opaque byte string of exactly ``struct.calcsize(type)``
    bytes; packing and unpacking the fields (``struct.pack``/``struct.unpack``)
    is left to the caller.  The array is backed either by an anonymous
    temporary file (removed when the array is closed) or by an existing file
    given as *fname* (kept when the array is closed).

    The code runs unchanged on Python 2.6+ and Python 3.
    """

    __file = __mem = None
    __fname = None          # path of the backing file, None once closed
    __istemp = False        # True when the backing file was created by us
    __realsize = __capsize = 0

    def __init__(self, type='B', fname=None, capsize=1024*1024):
        """Create an array of records laid out as the struct format *type*.

        type    -- ``struct`` format string describing one record.
        fname   -- existing file to map; when omitted a temporary file is used.
        capsize -- initial capacity (in records) of a temporary-file array.

        Raises ValueError for a zero-sized record format, and whatever
        ``fromfile`` raises when *fname* is given.
        """
        self.__elmsize = struct.calcsize(type)
        if self.__elmsize == 0:
            raise ValueError("The record format %r has zero size!" % (type,))

        if not fname:
            fno, self.__fname = tempfile.mkstemp("-mmarray", "pyslm-")
            self.__istemp = True
            # Binary mode: the records are raw bytes, not text.
            self.__file = os.fdopen(fno, "w+b")
            # An empty file cannot be mapped, so reserve at least one record.
            self.__enlarge(max(capsize, 1))
        else:
            self.fromfile(fname)

    def fromfile(self, fname):
        """Map the existing file *fname* and use its content as the array.

        Whatever was mapped before is closed first (a temporary backing file
        is removed).  Raises IOError if the file does not exist and ValueError
        if it is empty.
        """
        if not os.path.exists(fname):
            raise IOError("The file '%s' does not exist!" % fname)

        if os.path.getsize(fname) == 0:
            raise ValueError("The size of file '%s' is zero!" % fname)

        self.close()

        self.__file = open(fname, "r+b")
        self.__fname = fname
        self.__istemp = False
        # Size is re-read after close(), which may have truncated this very file.
        fsize = os.fstat(self.__file.fileno()).st_size
        self.__mem = mmap.mmap(self.__file.fileno(), fsize)
        self.__realsize = self.__capsize = fsize // self.__elmsize

    def tofile(self, fname):
        """Write the used part of the array to the file *fname*.

        Raises ValueError when *fname* is the file currently being mapped.
        """
        if self.__fname and \
           os.path.abspath(fname) == os.path.abspath(self.__fname):
            raise ValueError("Can not dump the array to currently mapping file!")
        bsize = self.__realsize * self.__elmsize
        with open(fname, "wb") as tf:
            tf.write(self.__mem[:bsize])

    def __enlarge(self, capsize):
        """Grow the backing file and the mapping to hold *capsize* records."""
        if self.__capsize >= capsize:
            return

        nbytes = self.__elmsize * capsize
        # Writing one byte at the new last offset extends the file; the
        # mapping may not be larger than the file itself.
        self.__file.seek(nbytes - 1)
        self.__file.write(b'\0')
        self.__file.flush()

        if self.__mem is not None:
            self.__mem.close()
        self.__mem = mmap.mmap(self.__file.fileno(), nbytes)
        self.__capsize = capsize

    def close(self):
        """Release the mapping and the backing file; safe to call repeatedly.

        The file is truncated to the bytes actually used.  A temporary backing
        file is removed, a file given by the user is kept.
        """
        if self.__mem is not None:
            self.__mem.close()
            self.__mem = None

        if self.__file is not None:
            try:
                self.__file.truncate(self.__realsize * self.__elmsize)
            finally:
                self.__file.close()
                self.__file = None

        if self.__istemp and self.__fname:
            try:
                os.remove(self.__fname)
            except OSError:
                pass    # already gone; nothing left to clean up

        self.__fname = None
        self.__istemp = False
        self.__realsize = self.__capsize = 0

    def __del__ (self):
        # Best effort only: errors cannot be reported from a destructor, and
        # module globals may already be torn down at interpreter exit.
        try:
            self.close()
        except Exception:
            pass

    def __index(self, idx):
        """Map *idx* (negative counts from the end) to an offset or raise IndexError."""
        if idx < 0:
            idx += self.__realsize
        if idx < 0 or idx >= self.__realsize:
            raise IndexError("MMArray index out of range")
        return idx

    def __checkbuf(self, buf):
        """Raise ValueError unless *buf* is a byte string of one record's size."""
        if not isinstance(buf, bytes) or len(buf) != self.__elmsize:
            raise ValueError("Not a string, or the buffer size is incorrect!")

    def __getitem__(self, idx):
        """Return the record at *idx* as a byte string."""
        return self.__access(self.__index(idx))

    def __setitem__(self, idx, buf):
        """Overwrite the record at *idx* with the byte string *buf*."""
        idx = self.__index(idx)
        self.__checkbuf(buf)
        self.__access(idx, buf)

    def __access (self, idx, buf=None):
        """Read (buf is None) or write the raw bytes of record *idx*; no checks."""
        start = idx * self.__elmsize
        end = start + self.__elmsize
        if buf is None:
            return self.__mem[start:end]
        self.__mem[start:end] = buf

    def size(self):
        """Return the number of records stored in the array."""
        return self.__realsize

    def append(self, buf):
        """Append the record *buf*, doubling the capacity when it is exhausted."""
        self.__checkbuf(buf)

        if self.__realsize >= self.__capsize:
            # max() keeps growth working from a capacity of zero.
            self.__enlarge(max(self.__capsize * 2, 1))

        self.__access(self.__realsize, buf)
        self.__realsize += 1

    def __iter__(self):
        """Yield the records present when iteration starts, in index order."""
        count = self.__realsize
        idx = 0
        while idx < count:
            yield self.__access(idx)
            idx += 1

    def truncate(self, tsize):
        """Shrink the array to *tsize* records; a larger *tsize* is ignored.

        Raises ValueError for a negative size.
        """
        if tsize < 0:
            raise ValueError("The size must not be negative!")
        if self.__realsize >= tsize:
            self.__realsize = tsize

当然,还有许多要改进的地方,例如支持从尾部索引(即index<0),以及slicing等等。

python-rbtree和内建dict的性能比较


Python内建的dict(字典)类使用的是hash算法,因此它的key不是有序的。而C++中的std::map或std::set使用的是平衡二叉树(通常为红黑树),其key是有序的。在网上搜了搜,找到了一个用C和pyrex混合实现的红黑树模块:python-rbtree。

我编写了一个极简单的测试程序,在Solaris x86 + python 2.4.4平台上运行,分别使用dict和rbtree,插入两百万个记录(key是3个整型,value是1个整型,你大概猜到我在干什么了吧 :))。且在dict插入完之后,调用dict.keys().sort()对其key进行排序(也就是快排)。比较的结果是,两种方法使用的内存相当(大概在200M左右)。但是hash算法的速度要快一倍以上。当记录个数增加到五百万个时,结果还是差不多──即内存使用相当,hash算法快一倍。

至少在这个数量级上,内建的dict性能更佳。我还尝试了另一个纯Python的红黑树实现--RBTree.py,结果令人失望,在记录个数比较多的情况下,似乎根本无法得到正确的结果。

结论,python中的dict是可信赖的!

A simple script to extract the contents from Sogou corpus

I wrote a simple python script to extract the contents from Sogou corpus.

#!/usr/bin/python
"""Extract the article bodies from a Sogou corpus file as UTF-8 text.

The corpus is read as GB18030; the body of every content line is written to
standard output, one article per line.  Works on Python 2.6+ and Python 3.
"""

import codecs
import sys

usage = """
Usage:
    sogou_corpus_conv.py corpus_in_xml > contents_in_txt
"""

# NOTE(review): in the published listing both tag literals were empty strings
# (apparently stripped as markup), which made every line match.  They are
# restored here as the <content> element -- confirm against your corpus files.
OPEN_TAG = u"<content>"
CLOSE_TAG = u"</content>"

# Private-use code point that is dropped from the text; presumably an artifact
# of the corpus encoding -- TODO confirm.
FILLER = u"\ue525"


def extract_contents(lines):
    """Yield the text of every line in *lines* that starts with OPEN_TAG.

    The opening tag, the closing tag (when present), the line terminator and
    every FILLER character are removed.  Other lines are skipped.
    """
    for line in lines:
        if not line.startswith(OPEN_TAG):
            continue
        # Strip the terminator explicitly so the last line of a file, which
        # may lack one, is not cut one character short.
        text = line.rstrip(u"\r\n")[len(OPEN_TAG):]
        if text.endswith(CLOSE_TAG):
            text = text[:-len(CLOSE_TAG)]
        yield text.replace(FILLER, u"")


def main(argv):
    """Convert the corpus named by argv[1]; return the process exit status."""
    if len(argv) < 2:
        # stdout is normally redirected to the result file, so report on stderr.
        sys.stderr.write(usage)
        return 1

    try:
        corpus = codecs.open(argv[1], "r", "GB18030")
    except IOError:
        sys.stderr.write(usage)
        return 1

    # Python 3 needs the underlying binary stream to write encoded bytes.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    try:
        for text in extract_contents(corpus):
            out.write(text.encode("UTF-8") + b"\n")
    finally:
        corpus.close()
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

With the extracted contents, you could continue to build the SunPinyin SLM.

A simple python utility for updating and merging Java properties

GNU gettext provides a set of tools for manipulating PO files, e.g. msgattrib, msgmerge, etc. While localizing a Java program recently, I could not find similar tools for Java properties files, so I wrote a small Python script to do the job: proputil.py.

$ ./proputil.py --help

Usage:

    proputil merge eng.properties loc1.properties loc2.properties
        merge loc1.propeties and loc2.properties, output to standard output, 
        NOTE: loc1.properties has higher piority!

    proputil update eng_orig.properties eng_new.properties
        report the updated and newly added properties from eng_orig.properties 
        to eng_new.properties

    proputil status eng.properties loc.properties
        report the localization status of loc.properties

    proputil wordcount test.properties
        report the wordcount of test.properties
    

For a more sophisticated property parser, you may refer to "ASPN Python Cookbook: A python replacement for java.util.Properties".

Record module for python by pyrex

----pyrecord.pyx----
# Names taken from X11/X.h: the Time type and the event type codes that this
# module compares against and uses as the bounds of the recorded range.
cdef extern from "X11/X.h":
    ctypedef unsigned long Time

    # declare the #define constants, don't have to provide their initial values
    # (only KeyPress carries an explicit value here; the members after it have none)
    enum:
        KeyPress=2
        KeyRelease
        ButtonPress
        ButtonRelease
        MotionNotify
        FocusIn
        FocusOut

# Partial description of xEvent from X11/Xproto.h.  Only the members read by
# _event_callback_wrapper are declared: u.u.type, u.u.detail and
# u.keyButtonPointer.rootX / rootY.
cdef extern from "X11/Xproto.h":
    # only need to declare the members you need to access
    # and no nested ctypedef/cdef supported so far
    # NOTE(review): S1, S2 and U look like stand-in names for the nested
    # members of the C declaration -- confirm against the header.
    ctypedef struct S1:
        unsigned char type
        unsigned char detail

    ctypedef struct S2:
        unsigned short rootX
        unsigned short rootY

    ctypedef union U:
        S1 u
        S2 keyButtonPointer

    ctypedef struct xEvent:
        U u

                
# Xlib types and functions used by this module: display connection handling,
# XFree, XSynchronize and the keycode/keysym/string conversions.
cdef extern from "X11/Xlib.h":
    enum Bool:
        False=0
        True=1

    ctypedef void *XPointer
    ctypedef int Status

    # opaque struct which you only use its pointer type
    ctypedef struct Display

    Display *XOpenDisplay(char* display_name)
    int XCloseDisplay(Display *display)
    int XFlush(Display* display)
    int XFree (void *data)

    # XSynchronize returns a pointer to a function; the result is ignored by
    # the caller in this module
    int (*XSynchronize(Display *display, Bool onoff)) ()
   
    ctypedef unsigned int XID
    ctypedef XID KeySym
    ctypedef unsigned char KeyCode

    KeyCode XKeysymToKeycode(Display* display, KeySym keysym)
    KeySym XStringToKeysym(char* string)
    KeySym XKeycodeToKeysym(Display *display, KeyCode keycode, int index)
    char *XKeysymToString(KeySym keysym)

# Constants, structures and functions of the X Record extension that the
# XRecord class and _event_callback_wrapper use.
cdef extern from "X11/extensions/record.h":
    # client specifiers; XRecord.__init__ uses XRecordAllClients
    enum:
        XRecordCurrentClients=1
        XRecordFutureClients
        XRecordAllClients

    # values of XRecordInterceptData.category; the callback wrapper only
    # handles XRecordFromServer
    enum:
        XRecordFromServer=0
        XRecordFromClient

    ctypedef struct XRecordRange8:
        unsigned char       first
        unsigned char       last

    # only the device_events member is declared because it is the only one set
    ctypedef struct XRecordRange:
        XRecordRange8     device_events

    # only the members read by the callback wrapper are declared
    ctypedef struct XRecordInterceptData:
        Time                server_time
        int                 category
        unsigned char       *data

    ctypedef unsigned long XRecordClientSpec
    ctypedef unsigned long XRecordContext

    XRecordRange *XRecordAllocRange()
   
    XRecordContext XRecordCreateContext(Display*, int, XRecordClientSpec*, int, XRecordRange**, int)

    # typedef function pointer
    ctypedef void (*XRecordInterceptProc) (XPointer, XRecordInterceptData*)   
    Status XRecordEnableContextAsync(Display*, XRecordContext, XRecordInterceptProc, XPointer)

    void XRecordProcessReplies(Display*)

    Status XRecordDisableContext(Display*, XRecordContext)
    Status XRecordFreeContext(Display*, XRecordContext)
    void XRecordFreeData(XRecordInterceptData*)

cdef void _event_callback_wrapper (XPointer py_fn, XRecordInterceptData *hook):
    # C callback registered with XRecordEnableContextAsync.  It converts one
    # intercepted event into a dict and passes it to the Python callable whose
    # pointer was registered as the closure argument (py_fn).
    #
    # The dict always has 'type'; key events add 'keysym' (a string, or None
    # when the keysym has no name), button events add 'button', and motion
    # events add 'rootX' and 'rootY'.
    cdef xEvent *data
    cdef char *keyname

    if hook.category != XRecordFromServer:
        XRecordFreeData (hook)
        return

    # casting
    data = <xEvent*> hook.data

    event_type = data.u.u.type
    event_spec = {'type': event_type}

    if event_type in (KeyPress, KeyRelease):
        keyname = XKeysymToString(XKeycodeToKeysym(g_disp, data.u.u.detail, 0))
        # XKeysymToString returns NULL for a keysym without a name; converting
        # a NULL char* to a Python string would crash.
        if keyname != NULL:
            event_spec['keysym'] = keyname
        else:
            event_spec['keysym'] = None

    elif event_type in (ButtonPress, ButtonRelease):
        event_spec['button'] = data.u.u.detail

    elif event_type == MotionNotify:
        event_spec['rootX'] = data.u.keyButtonPointer.rootX
        event_spec['rootY'] = data.u.keyButtonPointer.rootY

    # Everything needed has been copied into Python objects, so release the
    # intercept data now.  Previously it was freed only on the early-return
    # path above and leaked for every delivered event.  Freeing before the
    # call also covers the case where the Python callback raises.
    XRecordFreeData (hook)

    # delegate to python callback method
    (<object>py_fn) (event_spec)
   
# Control display of the most recently initialised XRecord; the callback
# wrapper uses it to translate keycodes into keysyms.
cdef Display *g_disp
cdef class XRecord:
    # Records key, button and pointer-motion events of all X clients and
    # reports them to a Python callback.
    #
    # Two display connections are used: ctrl_disp creates and destroys the
    # record context, data_disp receives the recorded events.
    cdef Display *data_disp
    cdef Display *ctrl_disp

    cdef XRecordRange *rr
    cdef XRecordClientSpec  rcs
    cdef XRecordContext   rc

    # Reference to the Python callback.  The record library is only given a
    # raw pointer to it, so the object must be kept alive here.
    cdef object callback

    # __new__ is also effective for constructor
    def __init__(self):
        # Raises RuntimeError when the display cannot be opened or the record
        # context cannot be created, MemoryError when no range can be allocated.
        self.ctrl_disp = XOpenDisplay (NULL)
        self.data_disp = XOpenDisplay (NULL)
        if self.ctrl_disp == NULL or self.data_disp == NULL:
            raise RuntimeError("Can not open the X display!")

        global g_disp
        g_disp = self.ctrl_disp

        XSynchronize(self.ctrl_disp, True)

        self.rr = XRecordAllocRange ()
        if self.rr == NULL:
            raise MemoryError("Can not allocate the XRecordRange!")

        # use '.' instead of '->' to access the data member
        self.rr.device_events.first = KeyPress
        self.rr.device_events.last = MotionNotify
        self.rcs = XRecordAllClients

        self.rc = XRecordCreateContext (self.ctrl_disp, 0, &(self.rcs), 1, &(self.rr), 1)
        if self.rc == 0:
            raise RuntimeError("Can not create the XRecord context!")

    def set_callback(self, cb):
        # Start recording; cb is called with one dict per recorded event each
        # time replies() processes pending data.  Raises RuntimeError when the
        # context cannot be enabled.
        self.callback = cb
        if not XRecordEnableContextAsync (self.data_disp, self.rc, _event_callback_wrapper, <void*>cb): #casting
            raise RuntimeError("Can not enable the XRecord context!")

    # __del__ does not work for pyrex extension type
    def __dealloc__(self):
        # Every member is checked because __init__ may have failed part way.
        if self.ctrl_disp != NULL and self.rc != 0:
            XRecordDisableContext (self.ctrl_disp, self.rc)
            XRecordFreeContext (self.ctrl_disp, self.rc)
        if self.rr != NULL:
            XFree (self.rr)

        if self.data_disp != NULL:
            XCloseDisplay (self.data_disp)
        if self.ctrl_disp != NULL:
            XCloseDisplay (self.ctrl_disp)

    def replies(self):
        # Process the recorded data received so far on the data connection.
        XRecordProcessReplies (self.data_disp)

----test.py----
import time

from pyrecord import XRecord

# X11 event type codes as reported in event['type'].
KEY_PRESS, KEY_RELEASE = 2, 3
BUTTON_PRESS, BUTTON_RELEASE = 4, 5
MOTION_NOTIFY = 6

stop = 0
def test_cb(event):
    """Print every recorded event; pressing Escape ends the main loop."""
    global stop
    etype = event['type']
    if etype in (KEY_PRESS, KEY_RELEASE):
        print("%s %s" % (etype, event['keysym']))
        if event['keysym'] == 'Escape':
            stop = 1
    elif etype in (BUTTON_PRESS, BUTTON_RELEASE):
        print("%s %s" % (etype, event['button']))
    elif etype == MOTION_NOTIFY:
        print("%s %s" % (event['rootX'], event['rootY']))

r=XRecord()
r.set_callback(test_cb)

while not stop:
    r.replies()
    # Polling without a pause keeps one CPU core fully busy.
    time.sleep(0.01)

del r

Wrap a c-library with pyrex and callback as python method

----pysig.pyx----
# Python callable invoked by _sighandler_wrapper; set by SIGNAL.set_callback
gcb = None

# The handler type and the registration function from signal.h
cdef extern from "signal.h":
        ctypedef void (*sighandler_t)(int)
        sighandler_t signal(int signum, sighandler_t handler)

# signal handler function does not have callback context data, save the python callback in module global variable
cdef void _sighandler_wrapper (int signum):
        # A signal may arrive before set_callback() has stored a callable;
        # calling None would only produce an error that cannot be reported
        # from a C signal handler, so such signals are ignored.
        if gcb is not None:
                gcb (signum)

# Installs _sighandler_wrapper as the C handler of one signal and lets a
# Python callable be attached to it.
cdef class SIGNAL:
        def __init__(self, signum):
                # register the C wrapper for signum; the previous handler
                # returned by signal() is discarded
                signal (signum, _sighandler_wrapper)

        def main_loop (self):
                # spin forever (busy-wait) so the process stays alive to
                # receive signals; this never returns
                while (1): pass

        def set_callback (self, cb):
                # store cb in the module-level gcb, which the C wrapper calls
                # with the signal number
                global gcb
                gcb = cb

----test.py----
import pysig


def test_cb (i):
        """Report the number of the signal that was delivered."""
        print('test_cb %s' % (i,))


# Hook signal 12, attach the Python callback and wait for signals forever.
handler = pysig.SIGNAL (12)
handler.set_callback (test_cb)
handler.main_loop ()


$ pyrexc pysig.pyx
$ gcc -c -fPIC pysig.c -I /usr/include/python2.4/
$ gcc --shared pysig.o -o pysig.so
$ python ./test.py

In another terminal
$ pkill -12 python


References: