Split a list into nested lists on a value

Tags:

Say I have a list like so:

[1, 4, None, 6, 9, None, 3, 9, 4]

I decide to split this into nested lists on None, to get this:

[[1, 4], [6, 9], [3, 9, 4]]

Of course, I could have wanted to do this on (9, None) in which case, we would have got:

[[1, 4], [6], [3], [4]]

This is trivial to do using list append through iteration (in a for loop)

I am interested to know whether this can be done in something faster - like a list comprehension?

If not, why not? (for example, a list comprehension cannot return more than one list element per iteration?)

294

asked Dec 01 '10 09:12

PoorLuzer

2 Answers

>>> def isplit(iterable,splitters):     return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k]  >>> isplit(L,(None,)) [[1, 4], [6, 9], [3, 9, 4]] >>> isplit(L,(None,9)) [[1, 4], [6], [3], [4]]

benchmark code:

import timeit      kabie=("isplit_kabie", """ import itertools def isplit_kabie(iterable,splitters):     return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k] """ )  ssplit=("ssplit", """ def ssplit(seq,splitters):     seq=list(seq)     if splitters and seq:         result=[]         begin=0         for end in range(len(seq)):             if seq[end] in splitters:                 if end > begin:                     result.append(seq[begin:end])                 begin=end+1         if begin<len(seq):             result.append(seq[begin:])         return result     return [seq] """ )  ssplit2=("ssplit2", """ def ssplit2(seq,splitters):     seq=list(seq)     if splitters and seq:         splitters=set(splitters).intersection(seq)         if splitters:             result=[]             begin=0             for end in range(len(seq)):                 if seq[end] in splitters:                     if end > begin:                         result.append(seq[begin:end])                     begin=end+1             if begin<len(seq):                 result.append(seq[begin:])             return result     return [seq] """ )  emile=("magicsplit", """ def _itersplit(l, *splitters):     current = []     for item in l:         if item in splitters:             yield current             current = []         else:             current.append(item)     yield current  def magicsplit(l, splitters):     return [subl for subl in _itersplit(l, *splitters) if subl] """ )  emile_improved=("magicsplit2", """ def _itersplit(l, *splitters):     current = []     for item in l:         if item in splitters:             if current:                 yield current                 current = []         else:             current.append(item)     if current:         yield current  def magicsplit2(l, splitters):     if splitters and l:         return [i for i in _itersplit(l, *splitters)]     return [list(l)] """ )  karl=("ssplit_karl", """ def ssplit_karl(original,splitters):     indices = [i for (i, x) in enumerate(original) if x in splitters]     ends = indices + [len(original)]     begins = [0] + [x + 1 for x in indices]     return [original[begin:end] for (begin, end) in zip(begins, ends)] """ )  ryan=("split_on", """ from functools import reduce def split_on (seq, delims, remove_empty=True):     '''Split seq into lists using delims as a delimiting elements.      For example, split_on(delims=2, list=xrange(0,5)) yields [ [0,1], [3,4] ].      delims can be either a single delimiting element or a list or     tuple of multiple delimiting elements. If you wish to use a list     or tuple as a delimiter, you must enclose it in another list or     tuple.      If remove_empty is False, then consecutive delimiter elements or delimiter elements at the beginning or end of the longlist'''     delims=set(delims)     def reduce_fun(lists, elem):         if elem in delims:             if remove_empty and lists[-1] == []:                 # Avoid adding multiple empty lists                 pass             else:                 lists.append([])         else:             lists[-1].append(elem)         return lists     result_list = reduce(reduce_fun, seq, [ [], ])     # Maybe remove trailing empty list     if remove_empty and result_list[-1] == []:         result_list.pop()     return result_list """ )  cases=(kabie, emile, emile_improved, ssplit ,ssplit2 ,ryan)  data=(     ([1, 4, None, 6, 9, None, 3, 9, 4 ],(None,)),     ([1, 4, None, 6, 9, None, 3, 9, 4 ]*5,{None,9,7}),     ((),()),     (range(1000),()),     ("Split me",('','')),     ("split me "*100,' '),     ("split me,"*100,' ,'*20),     ("split me, please!"*100,' ,!'),     (range(100),range(100)),     (range(100),range(101,1000)),     (range(100),range(50,150)),     (list(range(100))*30,(99,)),     )  params="seq,splitters"  def benchmark(func,code,data,params='',times=10000,rounds=3,debug=''):     assert(func.isidentifier())     tester = timeit.Timer(stmt='{func}({params})'.format(                                 func=func,params=params),                           setup="{code}\n".format(code=code)+             (params and "{params}={data}\n".format(params=params,data=data)) +             (debug and """ret=repr({func}({params})) print({func}.__name__.rjust(16),":",ret[:30]+"..."+ret[-15:] if len(ret)>50 else ret)                        """.format(func=func,params=params)))     results = [tester.timeit(times) for i in range(rounds)]     if not debug:         print("{:>16s} takes:{:6.4f},avg:{:.2e},best:{:.4f},worst:{:.4f}".format(             func,sum(results),sum(results)/times/rounds,min(results),max(results)))  def testAll(cases,data,params='',times=10000,rounds=3,debug=''):     if debug:         times,rounds = 1,1     for dat in data:         sdat = tuple(map(repr,dat))         print("{}x{} times:".format(times,rounds),               ','.join("{}".format(d[:8]+"..."+d[-5:] if len(d)>16 else d)for d in map(repr,dat)))         for func,code in cases:             benchmark(func,code,dat,params,times,rounds,debug)  if __name__=='__main__':     testAll(cases,data,params,500,10)#,debug=True)

Output on i3-530, Windows7, Python 3.1.2:

500x10 times: [1, 4, N...9, 4],(None,)     isplit_kabie takes:0.0605,avg:1.21e-05,best:0.0032,worst:0.0074       magicsplit takes:0.0287,avg:5.74e-06,best:0.0016,worst:0.0036      magicsplit2 takes:0.0174,avg:3.49e-06,best:0.0017,worst:0.0018           ssplit takes:0.0149,avg:2.99e-06,best:0.0015,worst:0.0016          ssplit2 takes:0.0198,avg:3.96e-06,best:0.0019,worst:0.0021         split_on takes:0.0229,avg:4.59e-06,best:0.0023,worst:0.0024 500x10 times: [1, 4, N...9, 4],{9, None, 7}     isplit_kabie takes:0.1448,avg:2.90e-05,best:0.0144,worst:0.0146       magicsplit takes:0.0636,avg:1.27e-05,best:0.0063,worst:0.0065      magicsplit2 takes:0.0891,avg:1.78e-05,best:0.0064,worst:0.0162           ssplit takes:0.0593,avg:1.19e-05,best:0.0058,worst:0.0061          ssplit2 takes:0.1004,avg:2.01e-05,best:0.0069,worst:0.0142         split_on takes:0.0929,avg:1.86e-05,best:0.0090,worst:0.0096 500x10 times: (),()     isplit_kabie takes:0.0041,avg:8.14e-07,best:0.0004,worst:0.0004       magicsplit takes:0.0040,avg:8.04e-07,best:0.0004,worst:0.0004      magicsplit2 takes:0.0022,avg:4.35e-07,best:0.0002,worst:0.0002           ssplit takes:0.0023,avg:4.59e-07,best:0.0002,worst:0.0003          ssplit2 takes:0.0023,avg:4.53e-07,best:0.0002,worst:0.0002         split_on takes:0.0072,avg:1.45e-06,best:0.0007,worst:0.0009 500x10 times: range(0, 1000),()     isplit_kabie takes:0.8892,avg:1.78e-04,best:0.0881,worst:0.0895       magicsplit takes:0.6614,avg:1.32e-04,best:0.0654,worst:0.0673      magicsplit2 takes:0.0958,avg:1.92e-05,best:0.0094,worst:0.0099           ssplit takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0095          ssplit2 takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0096         split_on takes:1.3348,avg:2.67e-04,best:0.1328,worst:0.1340 500x10 times: 'Split me',('', '')     isplit_kabie takes:0.0234,avg:4.68e-06,best:0.0023,worst:0.0024       magicsplit takes:0.0126,avg:2.52e-06,best:0.0012,worst:0.0013      magicsplit2 takes:0.0138,avg:2.76e-06,best:0.0013,worst:0.0015           ssplit takes:0.0119,avg:2.39e-06,best:0.0012,worst:0.0012          ssplit2 takes:0.0075,avg:1.50e-06,best:0.0007,worst:0.0008         split_on takes:0.0191,avg:3.83e-06,best:0.0018,worst:0.0023 500x10 times: 'split m... me ',' '     isplit_kabie takes:2.0803,avg:4.16e-04,best:0.2060,worst:0.2098       magicsplit takes:0.9219,avg:1.84e-04,best:0.0920,worst:0.0925      magicsplit2 takes:1.0221,avg:2.04e-04,best:0.1018,worst:0.1034           ssplit takes:0.8294,avg:1.66e-04,best:0.0818,worst:0.0834          ssplit2 takes:0.9911,avg:1.98e-04,best:0.0983,worst:0.1014         split_on takes:1.5672,avg:3.13e-04,best:0.1543,worst:0.1694 500x10 times: 'split m... me,',' , , , ... , ,'     isplit_kabie takes:2.1847,avg:4.37e-04,best:0.2164,worst:0.2275       magicsplit takes:3.7135,avg:7.43e-04,best:0.3693,worst:0.3783      magicsplit2 takes:3.8104,avg:7.62e-04,best:0.3795,worst:0.3884           ssplit takes:0.9522,avg:1.90e-04,best:0.0939,worst:0.0956          ssplit2 takes:1.0140,avg:2.03e-04,best:0.1009,worst:0.1023         split_on takes:1.5747,avg:3.15e-04,best:0.1563,worst:0.1615 500x10 times: 'split m...ase!',' ,!'     isplit_kabie takes:3.3443,avg:6.69e-04,best:0.3324,worst:0.3380       magicsplit takes:2.0594,avg:4.12e-04,best:0.2054,worst:0.2076      magicsplit2 takes:2.1850,avg:4.37e-04,best:0.2180,worst:0.2191           ssplit takes:1.4881,avg:2.98e-04,best:0.1484,worst:0.1493          ssplit2 takes:1.8779,avg:3.76e-04,best:0.1868,worst:0.1920         split_on takes:2.9596,avg:5.92e-04,best:0.2946,worst:0.2980 500x10 times: range(0, 100),range(0, 100)     isplit_kabie takes:0.9445,avg:1.89e-04,best:0.0933,worst:0.1023       magicsplit takes:0.5878,avg:1.18e-04,best:0.0583,worst:0.0593      magicsplit2 takes:0.5597,avg:1.12e-04,best:0.0554,worst:0.0588           ssplit takes:0.8568,avg:1.71e-04,best:0.0852,worst:0.0874          ssplit2 takes:0.1399,avg:2.80e-05,best:0.0121,worst:0.0242         split_on takes:0.1462,avg:2.92e-05,best:0.0145,worst:0.0148 500x10 times: range(0, 100),range(101, 1000)     isplit_kabie takes:19.9749,avg:3.99e-03,best:1.9789,worst:2.0330       magicsplit takes:9.4997,avg:1.90e-03,best:0.9369,worst:0.9640      magicsplit2 takes:9.4394,avg:1.89e-03,best:0.9267,worst:0.9665           ssplit takes:19.2363,avg:3.85e-03,best:1.8936,worst:1.9516          ssplit2 takes:0.2032,avg:4.06e-05,best:0.0201,worst:0.0205         split_on takes:0.3329,avg:6.66e-05,best:0.0323,worst:0.0344 500x10 times: range(0, 100),range(50, 150)     isplit_kabie takes:1.1394,avg:2.28e-04,best:0.1130,worst:0.1153       magicsplit takes:0.7288,avg:1.46e-04,best:0.0721,worst:0.0760      magicsplit2 takes:0.7220,avg:1.44e-04,best:0.0705,worst:0.0774           ssplit takes:1.0835,avg:2.17e-04,best:0.1059,worst:0.1116          ssplit2 takes:0.1092,avg:2.18e-05,best:0.0105,worst:0.0116         split_on takes:0.1639,avg:3.28e-05,best:0.0162,worst:0.0168 500x10 times: [0, 1, 2..., 99],(99,)     isplit_kabie takes:3.2579,avg:6.52e-04,best:0.3225,worst:0.3360       magicsplit takes:2.2937,avg:4.59e-04,best:0.2274,worst:0.2344      magicsplit2 takes:2.6054,avg:5.21e-04,best:0.2587,worst:0.2642           ssplit takes:1.5251,avg:3.05e-04,best:0.1495,worst:0.1729          ssplit2 takes:1.7298,avg:3.46e-04,best:0.1696,worst:0.1858         split_on takes:4.1041,avg:8.21e-04,best:0.4033,worst:0.4291

Slightly modified Ryan's code, hope you don't mind. ssplit was based on the idea of Karl. Added statements handling some special cases to became ssplit2 which is the best solution I may provide.

answered Oct 14 '22 19:10

Kabie

Something like this:

def _itersplit(l, splitters):     current = []     for item in l:         if item in splitters:             yield current             current = []         else:             current.append(item)     yield current  def magicsplit(l, *splitters):     return [subl for subl in _itersplit(l, splitters) if subl]

gets me:

>>> l = [1, 4, None, 6, 9, None, 3, 9, 4 ] >>> magicsplit(l, None) [[1, 4], [6, 9], [3, 9, 4]] >>> magicsplit(l, None, 9) [[1, 4], [6], [3], [4]]

answered Oct 14 '22 19:10

Emile

Related questions
                            
                                How do you convert a Javascript timestamp into UTC format?
                            
                                Close jQuery UI Dialog from Iframe
                            
                                how to release the caching which is used by Mongodb?
                            
                                Extract Data from .PDF files
                            
                                Django - Get uploaded file type / mimetype
                            
                                How to use HtmlUnit in Java to select an element from a drop down box?
                            
                                Ignore Exception in C#
                            
                                Windows' limit on the number of files in a particular folder [closed]
                            
                                test if an MSBuild property is defined?
                            
                                Calculating and printing factorial at compile time in C++
                            
                                Where should I prefer pass-by-reference or pass-by-value?
                            
                                How to set -Dorg.apache.el.parser.COERCE_TO_ZERO=false programmatically

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With