2010-12-01 37 views
22

Decir que tengo una lista de este modo:Dividir una lista en listas anidadas en un valor

[1, 4, None, 6, 9, None, 3, 9, 4 ] 

decido dividir esto en las listas anidadas en None, para conseguir esto:

[ [ 1, 4 ], [ 6, 9 ], [ 3, 9, 4 ] ] 

De por supuesto, podría haber querido hacer esto en (9, None) en cuyo caso, habríamos conseguido:

[ [ 1, 4 ], [ 6 ], [ 3 ], [ 4 ] ] 

esta es t rivial para hacer usando lista anexar a través de la iteración (en un ciclo for)

Estoy interesado en saber si esto se puede hacer en algo más rápido, como una lista de comprensión?

Si no es así, ¿por qué no (por ejemplo, una lista por comprensión no puede devolver más de un elemento de la lista por iteración?)

+0

: -D. Nada que ver con eso realmente.Me acabo de hacer esta pregunta y no pude pensar en algo rápido. – PoorLuzer

Respuesta

32
>>> def isplit(iterable,splitters): 
    return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k] 

>>> isplit(L,(None,)) 
[[1, 4], [6, 9], [3, 9, 4]] 
>>> isplit(L,(None,9)) 
[[1, 4], [6], [3], [4]] 

el código de prueba:

import timeit  

kabie=("isplit_kabie", 
""" 
import itertools 
def isplit_kabie(iterable,splitters): 
    return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k] 
""") 

ssplit=("ssplit", 
""" 
def ssplit(seq,splitters): 
    seq=list(seq) 
    if splitters and seq: 
     result=[] 
     begin=0 
     for end in range(len(seq)): 
      if seq[end] in splitters: 
       if end > begin: 
        result.append(seq[begin:end]) 
       begin=end+1 
     if begin<len(seq): 
      result.append(seq[begin:]) 
     return result 
    return [seq] 
""") 

ssplit2=("ssplit2", 
""" 
def ssplit2(seq,splitters): 
    seq=list(seq) 
    if splitters and seq: 
     splitters=set(splitters).intersection(seq) 
     if splitters: 
      result=[] 
      begin=0 
      for end in range(len(seq)): 
       if seq[end] in splitters: 
        if end > begin: 
         result.append(seq[begin:end]) 
        begin=end+1 
      if begin<len(seq): 
       result.append(seq[begin:]) 
      return result 
    return [seq] 
""") 

emile=("magicsplit", 
""" 
def _itersplit(l, *splitters): 
    current = [] 
    for item in l: 
     if item in splitters: 
      yield current 
      current = [] 
     else: 
      current.append(item) 
    yield current 

def magicsplit(l, splitters): 
    return [subl for subl in _itersplit(l, *splitters) if subl] 
""") 

emile_improved=("magicsplit2", 
""" 
def _itersplit(l, *splitters): 
    current = [] 
    for item in l: 
     if item in splitters: 
      if current: 
       yield current 
       current = [] 
     else: 
      current.append(item) 
    if current: 
     yield current 

def magicsplit2(l, splitters): 
    if splitters and l: 
     return [i for i in _itersplit(l, *splitters)] 
    return [list(l)] 
""") 

karl=("ssplit_karl", 
""" 
def ssplit_karl(original,splitters): 
    indices = [i for (i, x) in enumerate(original) if x in splitters] 
    ends = indices + [len(original)] 
    begins = [0] + [x + 1 for x in indices] 
    return [original[begin:end] for (begin, end) in zip(begins, ends)] 
""") 

ryan=("split_on", 
""" 
from functools import reduce 
def split_on (seq, delims, remove_empty=True): 
    '''Split seq into lists using delims as a delimiting elements. 

    For example, split_on(delims=2, list=xrange(0,5)) yields [ [0,1], [3,4] ]. 

    delims can be either a single delimiting element or a list or 
    tuple of multiple delimiting elements. If you wish to use a list 
    or tuple as a delimiter, you must enclose it in another list or 
    tuple. 

    If remove_empty is False, then consecutive delimiter elements or delimiter elements at the beginning or end of the longlist''' 
    delims=set(delims) 
    def reduce_fun(lists, elem): 
     if elem in delims: 
      if remove_empty and lists[-1] == []: 
       # Avoid adding multiple empty lists 
       pass 
      else: 
       lists.append([]) 
     else: 
      lists[-1].append(elem) 
     return lists 
    result_list = reduce(reduce_fun, seq, [ [], ]) 
    # Maybe remove trailing empty list 
    if remove_empty and result_list[-1] == []: 
     result_list.pop() 
    return result_list 
""") 

cases=(kabie, emile, emile_improved, ssplit ,ssplit2 ,ryan) 

data=(
    ([1, 4, None, 6, 9, None, 3, 9, 4 ],(None,)), 
    ([1, 4, None, 6, 9, None, 3, 9, 4 ]*5,{None,9,7}), 
    ((),()), 
    (range(1000),()), 
    ("Split me",('','')), 
    ("split me "*100,' '), 
    ("split me,"*100,' ,'*20), 
    ("split me, please!"*100,' ,!'), 
    (range(100),range(100)), 
    (range(100),range(101,1000)), 
    (range(100),range(50,150)), 
    (list(range(100))*30,(99,)), 
    ) 

params="seq,splitters" 

def benchmark(func,code,data,params='',times=10000,rounds=3,debug=''): 
    assert(func.isidentifier()) 
    tester = timeit.Timer(stmt='{func}({params})'.format(
           func=func,params=params), 
          setup="{code}\n".format(code=code)+ 
      (params and "{params}={data}\n".format(params=params,data=data)) + 
      (debug and """ret=repr({func}({params})) 
print({func}.__name__.rjust(16),":",ret[:30]+"..."+ret[-15:] if len(ret)>50 else ret) 
         """.format(func=func,params=params))) 
    results = [tester.timeit(times) for i in range(rounds)] 
    if not debug: 
     print("{:>16s} takes:{:6.4f},avg:{:.2e},best:{:.4f},worst:{:.4f}".format(
      func,sum(results),sum(results)/times/rounds,min(results),max(results))) 

def testAll(cases,data,params='',times=10000,rounds=3,debug=''): 
    if debug: 
     times,rounds = 1,1 
    for dat in data: 
     sdat = tuple(map(repr,dat)) 
     print("{}x{} times:".format(times,rounds), 
       ','.join("{}".format(d[:8]+"..."+d[-5:] if len(d)>16 else d)for d in map(repr,dat))) 
     for func,code in cases: 
      benchmark(func,code,dat,params,times,rounds,debug) 

if __name__=='__main__': 
    testAll(cases,data,params,500,10)#,debug=True) 

de salida en i3-530, Windows7, Python 3.1.2:

500x10 times: [1, 4, N...9, 4],(None,) 
    isplit_kabie takes:0.0605,avg:1.21e-05,best:0.0032,worst:0.0074 
     magicsplit takes:0.0287,avg:5.74e-06,best:0.0016,worst:0.0036 
    magicsplit2 takes:0.0174,avg:3.49e-06,best:0.0017,worst:0.0018 
      ssplit takes:0.0149,avg:2.99e-06,best:0.0015,worst:0.0016 
     ssplit2 takes:0.0198,avg:3.96e-06,best:0.0019,worst:0.0021 
     split_on takes:0.0229,avg:4.59e-06,best:0.0023,worst:0.0024 
500x10 times: [1, 4, N...9, 4],{9, None, 7} 
    isplit_kabie takes:0.1448,avg:2.90e-05,best:0.0144,worst:0.0146 
     magicsplit takes:0.0636,avg:1.27e-05,best:0.0063,worst:0.0065 
    magicsplit2 takes:0.0891,avg:1.78e-05,best:0.0064,worst:0.0162 
      ssplit takes:0.0593,avg:1.19e-05,best:0.0058,worst:0.0061 
     ssplit2 takes:0.1004,avg:2.01e-05,best:0.0069,worst:0.0142 
     split_on takes:0.0929,avg:1.86e-05,best:0.0090,worst:0.0096 
500x10 times:(),() 
    isplit_kabie takes:0.0041,avg:8.14e-07,best:0.0004,worst:0.0004 
     magicsplit takes:0.0040,avg:8.04e-07,best:0.0004,worst:0.0004 
    magicsplit2 takes:0.0022,avg:4.35e-07,best:0.0002,worst:0.0002 
      ssplit takes:0.0023,avg:4.59e-07,best:0.0002,worst:0.0003 
     ssplit2 takes:0.0023,avg:4.53e-07,best:0.0002,worst:0.0002 
     split_on takes:0.0072,avg:1.45e-06,best:0.0007,worst:0.0009 
500x10 times: range(0, 1000),() 
    isplit_kabie takes:0.8892,avg:1.78e-04,best:0.0881,worst:0.0895 
     magicsplit takes:0.6614,avg:1.32e-04,best:0.0654,worst:0.0673 
    magicsplit2 takes:0.0958,avg:1.92e-05,best:0.0094,worst:0.0099 
      ssplit takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0095 
     ssplit2 takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0096 
     split_on takes:1.3348,avg:2.67e-04,best:0.1328,worst:0.1340 
500x10 times: 'Split me',('', '') 
    isplit_kabie takes:0.0234,avg:4.68e-06,best:0.0023,worst:0.0024 
     magicsplit takes:0.0126,avg:2.52e-06,best:0.0012,worst:0.0013 
    magicsplit2 takes:0.0138,avg:2.76e-06,best:0.0013,worst:0.0015 
      ssplit takes:0.0119,avg:2.39e-06,best:0.0012,worst:0.0012 
     ssplit2 takes:0.0075,avg:1.50e-06,best:0.0007,worst:0.0008 
     split_on takes:0.0191,avg:3.83e-06,best:0.0018,worst:0.0023 
500x10 times: 'split m... me ',' ' 
    isplit_kabie takes:2.0803,avg:4.16e-04,best:0.2060,worst:0.2098 
     magicsplit takes:0.9219,avg:1.84e-04,best:0.0920,worst:0.0925 
    magicsplit2 takes:1.0221,avg:2.04e-04,best:0.1018,worst:0.1034 
      ssplit takes:0.8294,avg:1.66e-04,best:0.0818,worst:0.0834 
     ssplit2 takes:0.9911,avg:1.98e-04,best:0.0983,worst:0.1014 
     split_on takes:1.5672,avg:3.13e-04,best:0.1543,worst:0.1694 
500x10 times: 'split m... me,',' , , , ... , ,' 
    isplit_kabie takes:2.1847,avg:4.37e-04,best:0.2164,worst:0.2275 
     magicsplit takes:3.7135,avg:7.43e-04,best:0.3693,worst:0.3783 
    magicsplit2 takes:3.8104,avg:7.62e-04,best:0.3795,worst:0.3884 
      ssplit takes:0.9522,avg:1.90e-04,best:0.0939,worst:0.0956 
     ssplit2 takes:1.0140,avg:2.03e-04,best:0.1009,worst:0.1023 
     split_on takes:1.5747,avg:3.15e-04,best:0.1563,worst:0.1615 
500x10 times: 'split m...ase!',' ,!' 
    isplit_kabie takes:3.3443,avg:6.69e-04,best:0.3324,worst:0.3380 
     magicsplit takes:2.0594,avg:4.12e-04,best:0.2054,worst:0.2076 
    magicsplit2 takes:2.1850,avg:4.37e-04,best:0.2180,worst:0.2191 
      ssplit takes:1.4881,avg:2.98e-04,best:0.1484,worst:0.1493 
     ssplit2 takes:1.8779,avg:3.76e-04,best:0.1868,worst:0.1920 
     split_on takes:2.9596,avg:5.92e-04,best:0.2946,worst:0.2980 
500x10 times: range(0, 100),range(0, 100) 
    isplit_kabie takes:0.9445,avg:1.89e-04,best:0.0933,worst:0.1023 
     magicsplit takes:0.5878,avg:1.18e-04,best:0.0583,worst:0.0593 
    magicsplit2 takes:0.5597,avg:1.12e-04,best:0.0554,worst:0.0588 
      ssplit takes:0.8568,avg:1.71e-04,best:0.0852,worst:0.0874 
     ssplit2 takes:0.1399,avg:2.80e-05,best:0.0121,worst:0.0242 
     split_on takes:0.1462,avg:2.92e-05,best:0.0145,worst:0.0148 
500x10 times: range(0, 100),range(101, 1000) 
    isplit_kabie takes:19.9749,avg:3.99e-03,best:1.9789,worst:2.0330 
     magicsplit takes:9.4997,avg:1.90e-03,best:0.9369,worst:0.9640 
    magicsplit2 takes:9.4394,avg:1.89e-03,best:0.9267,worst:0.9665 
      ssplit takes:19.2363,avg:3.85e-03,best:1.8936,worst:1.9516 
     ssplit2 takes:0.2032,avg:4.06e-05,best:0.0201,worst:0.0205 
     split_on takes:0.3329,avg:6.66e-05,best:0.0323,worst:0.0344 
500x10 times: range(0, 100),range(50, 150) 
    isplit_kabie takes:1.1394,avg:2.28e-04,best:0.1130,worst:0.1153 
     magicsplit takes:0.7288,avg:1.46e-04,best:0.0721,worst:0.0760 
    magicsplit2 takes:0.7220,avg:1.44e-04,best:0.0705,worst:0.0774 
      ssplit takes:1.0835,avg:2.17e-04,best:0.1059,worst:0.1116 
     ssplit2 takes:0.1092,avg:2.18e-05,best:0.0105,worst:0.0116 
     split_on takes:0.1639,avg:3.28e-05,best:0.0162,worst:0.0168 
500x10 times: [0, 1, 2..., 99],(99,) 
    isplit_kabie takes:3.2579,avg:6.52e-04,best:0.3225,worst:0.3360 
     magicsplit takes:2.2937,avg:4.59e-04,best:0.2274,worst:0.2344 
    magicsplit2 takes:2.6054,avg:5.21e-04,best:0.2587,worst:0.2642 
      ssplit takes:1.5251,avg:3.05e-04,best:0.1495,worst:0.1729 
     ssplit2 takes:1.7298,avg:3.46e-04,best:0.1696,worst:0.1858 
     split_on takes:4.1041,avg:8.21e-04,best:0.4033,worst:0.4291 

ligeramente modificada código de Ryan, espero que no te importe. ssplit se basó en la idea de Karl. Las declaraciones agregadas que manejan algunos casos especiales se convierten en ssplit2, que es la mejor solución que puedo proporcionar.

+1

+1 por elegancia. –

+0

¡eso es bueno! +1 – mshsayem

+0

Podría haber hecho una pregunta demasiado simple. ¡Tener dificultades para marcar una respuesta! Todos estan bien. No estoy seguro, pero su solución "parece" ser la más rápida * y * correcta. El segundo en línea sería, en ese caso, Emile y luego Karl. ¿Podría, si es posible, comparar estos fragmentos? – PoorLuzer

2

se puede encontrar los índices de los elementos "delimitador". Por ejemplo, los índices de Ninguno son 2 y 5 (basados ​​en cero). Puede usar estos, junto con la longitud de la lista, para construir una lista de tuplas [ (0,1), (3,4), (6,8) ] que representan los índices de inicio y los índices finales de las sublistas. Luego puede usar una lista de comprensión sobre esta lista de tuplas para extraer sublistas.

5

Algo como esto:

def _itersplit(l, splitters): 
    current = [] 
    for item in l: 
     if item in splitters: 
      yield current 
      current = [] 
     else: 
      current.append(item) 
    yield current 

def magicsplit(l, *splitters): 
    return [subl for subl in _itersplit(l, splitters) if subl] 

me pone:

>>> l = [1, 4, None, 6, 9, None, 3, 9, 4 ] 
>>> magicsplit(l, None) 
[[1, 4], [6, 9], [3, 9, 4]] 
>>> magicsplit(l, None, 9) 
[[1, 4], [6], [3], [4]] 
+0

¡Bien pensado Emile! – PoorLuzer

+0

Debo decir que esta es la mejor solución debido a su extraordinario rendimiento. 1000 veces Rango dividido (0, 100) con rango (101, 1000) isplit_kabie toma: 3.4607, toma magicsplit: 0.0167. – Kabie

+0

@Kabie: Me gustaría ver su código de evaluación comparativa. Este es un generador, así que ** debes tener cuidado ** de que todo lo que hiciste en los 0.0167 de magicsplit no fue solo tener un montón de generadores. Los otros, incl. tu devuelve una lista. – PoorLuzer

1

El problema con tratar de utilizar una lista por comprensión de esto es que la comprensión es inherentemente sin estado, y que necesita el estado para llevar a cabo la operación de división. En particular, debe recordar, de un elemento al siguiente, qué elementos se encontraron después del marcador de "división" anterior.

Sin embargo, puede usar una lista de comprensión para extraer los índices de los elementos divididos, y luego otra para usar esos índices para cortar la lista. Necesitamos traducir los índices divididos en índices (inicio, fin) para las 'piezas' necesarias. Lo que haremos será transformar la lista de índices divididos en dos listas separadas de 'comienza' y 'finaliza', y luego los comprimiremos juntos.

El conjunto cosa es así:

splitters = (9, None) # for example 
indices = [i for (i, x) in enumerate(original) if x in splitters] 
ends = indices + [len(original)] 
begins = [0] + [x + 1 for x in indices] 
result = [original[begin:end] for (begin, end) in zip(begins, ends)] 
1

Aquí es una implementación usando reducir, con un par de campanas y silbatos:

#!/usr/bin/env python 

def split_on (delims, seq, remove_empty=True): 
    '''Split seq into lists using delims as a delimiting elements. 

    For example, split_on(delims=2, list=xrange(0,5)) yields [ [0,1], [3,4] ]. 

    delims can be either a single delimiting element or a list or 
    tuple of multiple delimiting elements. If you wish to use a list 
    or tuple as a delimiter, you must enclose it in another list or 
    tuple. 

    If remove_empty is False, then consecutive delimiter elements or delimiter elements at the beginning or end of the longlist''' 
    if type(delims) not in (type(list()), type(tuple())): 
     delims = (delims,) 
    def reduce_fun(lists, elem): 
     if elem in delims: 
      if remove_empty and lists[-1] == []: 
       # Avoid adding multiple empty lists 
       pass 
      else: 
       lists.append([]) 
     else: 
      lists[-1].append(elem) 
     return lists 
    result_list = reduce(reduce_fun, seq, [ [], ]) 
    # Maybe remove trailing empty list 
    if remove_empty and result_list[-1] == []: 
     result_list.pop() 
    return result_list 

mylist = [1, 4, None, 6, 9, None, 3, 9, 4 ] 

print(split_on(None, mylist)) 
print(split_on((9, None), mylist)) 
Cuestiones relacionadas