SQL INSERT pero evita duplicados

Quiero hacer algunas inserciones rápidas evitando duplicados en una tabla. Para efectos de la discusión, llamémosla MarketPrices. He estado experimentando con dos formas de hacerlo, pero no estoy seguro de cómo medir cuál será más rápida.

-- Approach 1: insert the candidate row only if it survives the EXCEPT.
-- Every stored (SecurityCode, BuyPrice, SellPrice) is paired with both 0 and 1
-- through the CROSS JOIN, so a candidate whose flag is 0 or 1 is removed
-- whenever its three key columns already exist, whatever the stored flag is.
INSERT INTO MarketPrices (SecurityCode, BuyPrice, SellPrice, IsMarketOpen)
SELECT @SecurityCode, @BuyPrice, @SellPrice, @IsMarketOpen
EXCEPT
SELECT mp.SecurityCode, mp.BuyPrice, mp.SellPrice, flags.flag_value
FROM MarketPrices AS mp
CROSS JOIN (
    SELECT 0 AS flag_value
    UNION
    SELECT 1 AS flag_value
) AS flags

-- Approach 2: look up an existing row with the same code and prices,
-- then insert only when the lookup found nothing.
DECLARE @MktId int 
SET @MktId = (SELECT SecurityId FROM MarketPrices 
       where SecurityCode = @SecurityCode 
       and BuyPrice = @BuyPrice 
       and SellPrice = @SellPrice) 

-- A NULL @MktId is taken to mean that no matching row exists yet.
IF (@MktId is NULL) 
BEGIN 
    INSERT INTO MarketPrices (SecurityCode, BuyPrice, SellPrice, IsMarketOpen) 
    VALUES 
    (@SecurityCode,@BuyPrice, @SellPrice, @IsMarketOpen) 
END

Suponga que @whatever es un parámetro de entrada en el procedimiento almacenado.

Quiero poder insertar un nuevo registro para cada SecurityCode cuando BuyPrice, SellPrice o ambos sean diferentes de cualquier ocurrencia anterior. No me importa IsMarketOpen.

¿Hay algo absolutamente estúpido sobre cualquiera de los enfoques anteriores? ¿Es uno más rápido que el otro?

Fuente

2009-11-06 Ravi

Recuerde que el segundo enfoque debe incluirse en una transacción; de lo contrario, podría tener problemas de concurrencia. –

¿No puedes simplemente crear un índice único? No tengo experiencia en MS SQL, pero creo que debería haber tales índices –

@valya: Es curioso cómo la gente duda de que SQL Server pueda hacer incluso las cosas más simples. Ni siquiera estoy seguro de que se pueda implementar un motor de base de datos relacional *sin* que soporte índices únicos. – Tomalak

EDITAR: para prevenir condiciones de carrera (race conditions) en entornos concurrentes, utilice WITH (UPDLOCK) en la subconsulta correlacionada o en el SELECT al que se aplica EXCEPT. El script de prueba que escribí a continuación no lo requiere, ya que usa tablas temporales que solo son visibles para la conexión actual, pero en un entorno real, que opere sobre tablas de usuario, sería necesario.

MERGE no requiere UPDLOCK.

Inspirado por la respuesta de mcl sobre el índice único y dejar que la base de datos emita un error, decidí comparar el rendimiento de las inserciones condicionales frente a try/catch.

Los resultados parecen favorecer la inserción condicional sobre try/catch, pero sus resultados pueden variar (YMMV). Es un escenario muy simple (una columna, una tabla pequeña, etc.), ejecutado en una sola máquina, etc.

Estos son los resultados (SQL Server 2008, compilación 10.0.1600.2):

duplicates (short table)  
    try/catch:    14440 milliseconds/100000 inserts 
    conditional insert:  2983 milliseconds/100000 inserts 
    except:     2966 milliseconds/100000 inserts 
    merge:      2983 milliseconds/100000 inserts 

uniques 
    try/catch:     3920 milliseconds/100000 inserts 
    conditional insert:  3860 milliseconds/100000 inserts 
    except:     3873 milliseconds/100000 inserts 
    merge:      3890 milliseconds/100000 inserts 

    straight insert:   3173 milliseconds/100000 inserts 

duplicates (tall table) 
    try/catch:    14436 milliseconds/100000 inserts 
    conditional insert:  3063 milliseconds/100000 inserts 
    except:     3063 milliseconds/100000 inserts 
    merge:      3030 milliseconds/100000 inserts

Observe que, incluso en las inserciones únicas, try/catch tiene ligeramente más sobrecarga que una inserción condicional. Me pregunto si esto varía según la versión, la CPU, el número de núcleos, etc.

No comparé las inserciones condicionales con IF, solo con WHERE. Supongo que la variante con IF mostraría más sobrecarga, ya que a) tendría dos instrucciones, y b) habría que envolver las dos instrucciones en una transacción y establecer el nivel de aislamiento en serializable (!). Si alguien quisiera probar esto, necesitaría cambiar la tabla temporal por una tabla de usuario normal (serializable no se aplica a las tablas temporales locales).

Aquí está la secuencia de comandos:

-- Benchmark script; tested on SQL 2008.
-- To run it on SQL 2005, comment out the statements that use MERGE.
SET NOCOUNT ON

-- Recreate the single-column temp table used by every test below.
IF OBJECT_ID('tempdb..#temp') IS NOT NULL
    DROP TABLE #temp
CREATE TABLE #temp (col1 INT PRIMARY KEY)
GO

-------------------------------------------------------

-- duplicate-insert timings against a table holding a single row

-------------------------------------------------------

-- Seed the one row (col1 = 1) that every attempt in this section tries to insert again.
INSERT #temp VALUES (1)
GO

-- Variant: blind insert inside TRY/CATCH; whatever error the insert raises is swallowed.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    BEGIN TRY
        INSERT #temp SELECT @val
    END TRY
    BEGIN CATCH
    END CATCH
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (short table), try/catch: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-- Variant: conditional insert guarded by WHERE NOT EXISTS.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    INSERT #temp
    SELECT @val
    WHERE NOT EXISTS (SELECT * FROM #temp WHERE col1 = @val)
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (short table), conditional insert: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-- Variant: EXCEPT removes the candidate when it is already stored.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    INSERT #temp
    SELECT @val
    EXCEPT
    SELECT col1 FROM #temp
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (short table), except: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-- Variant: MERGE that inserts only when the target has no match.
-- Comment this batch out for SQL 2005.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    MERGE #temp AS tgt
    USING (SELECT @val) AS src (col1)
        ON tgt.col1 = src.col1
    WHEN NOT MATCHED BY TARGET THEN
        INSERT VALUES (src.col1);
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (short table), merge: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-------------------------------------------------------

-- unique-insert timings; every batch empties the table before it starts

-------------------------------------------------------

-- Baseline: plain insert with no duplicate handling at all.
TRUNCATE TABLE #temp
DECLARE @n INT, @started DATETIME, @elapsed_ms INT
SET @n = 0
SET @started = GETDATE()
WHILE @n < 100000
BEGIN
    SET @n = @n + 1
    INSERT #temp SELECT @n
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('uniques, straight insert: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@n) WITH NOWAIT
GO

-- Variant: blind insert inside TRY/CATCH with an empty CATCH.
TRUNCATE TABLE #temp
DECLARE @n INT, @started DATETIME, @elapsed_ms INT
SET @n = 0
SET @started = GETDATE()
WHILE @n < 100000
BEGIN
    SET @n = @n + 1
    BEGIN TRY
        INSERT #temp SELECT @n
    END TRY
    BEGIN CATCH
    END CATCH
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('uniques, try/catch: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@n) WITH NOWAIT
GO

-- Variant: conditional insert guarded by WHERE NOT EXISTS.
TRUNCATE TABLE #temp
DECLARE @n INT, @started DATETIME, @elapsed_ms INT
SET @n = 0
SET @started = GETDATE()
WHILE @n < 100000
BEGIN
    SET @n = @n + 1
    INSERT #temp
    SELECT @n
    WHERE NOT EXISTS (SELECT * FROM #temp WHERE col1 = @n)
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('uniques, conditional insert: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@n) WITH NOWAIT
GO

-- Variant: EXCEPT against the rows already stored.
TRUNCATE TABLE #temp
DECLARE @n INT, @started DATETIME, @elapsed_ms INT
SET @n = 0
SET @started = GETDATE()
WHILE @n < 100000
BEGIN
    SET @n = @n + 1
    INSERT #temp
    SELECT @n
    EXCEPT
    SELECT col1 FROM #temp
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('uniques, except: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@n) WITH NOWAIT
GO

-- comment this batch out for SQL 2005 
-- Unique-insert timing for MERGE: empties the table, then merges the values
-- 1..100000 one at a time and reports the elapsed milliseconds.
truncate table #temp 
declare @x int, @now datetime, @duration int 
-- The counter starts at 0, like the other uniques batches, so the loop really
-- performs the 100000 inserts that the message reports and value 1 is stored.
select @x = 0, @now = getdate() 
while @x < 100000 begin 
    set @x = @x+1 
    merge #temp t using (select @x) s (col1) on t.col1 = s.col1 when not matched by target then insert values (col1); 
end 
set @duration = datediff(ms,@now,getdate()) 
raiserror('uniques, merge: %i milliseconds/%i inserts',-1,-1,@duration, @x) with nowait 
go 

-------------------------------------------------------

-- duplicate-insert timings against a table w/ 100000 records

-------------------------------------------------------

-- No TRUNCATE in this section: the batches run against whatever rows #temp
-- already holds when they start.

-- Variant: blind insert inside TRY/CATCH; whatever error the insert raises is swallowed.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    BEGIN TRY
        INSERT #temp SELECT @val
    END TRY
    BEGIN CATCH
    END CATCH
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (tall table), try/catch: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-- Variant: conditional insert guarded by WHERE NOT EXISTS.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    INSERT #temp
    SELECT @val
    WHERE NOT EXISTS (SELECT * FROM #temp WHERE col1 = @val)
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (tall table), conditional insert: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-- Variant: EXCEPT removes the candidate when it is already stored.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    INSERT #temp
    SELECT @val
    EXCEPT
    SELECT col1 FROM #temp
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (tall table), except: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-- Variant: MERGE that inserts only when the target has no match.
-- Comment this batch out for SQL 2005.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    MERGE #temp AS tgt
    USING (SELECT @val) AS src (col1)
        ON tgt.col1 = src.col1
    WHEN NOT MATCHED BY TARGET THEN
        INSERT VALUES (src.col1);
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (tall table), merge: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

Fuente

2009-11-06 17:31:51

La razón principal para usar un índice único aquí es garantizar la integridad de los datos. Sospecho que una inserción fallida en un bloque try/catch no resultará ser un cuello de botella en la mayoría de las aplicaciones, especialmente en el escenario donde no hay muchos intentos de insertar un duplicado (ya que su benchmark muestra un rendimiento similar en ese caso). Pero sospecho que tener un modelo de datos cuyas reglas no se hacen cumplir mediante restricciones va a causar un problema en algún momento. Además, en SQL Server 2008, sugeriría explorar el uso de MERGE sobre cualquiera de estas otras estrategias. – mlibby

@mcl re: índice único, estoy completamente de acuerdo, debe tener un índice de integridad de datos, y necesitará uno si quiere un rendimiento razonable. re: MERGE, acabo de probarlo, y funciona * muy * de forma similar a una inserción condicional en todos los escenarios. –

Gracias chicos, ojalá pudiera aceptar sus dos respuestas. Voy a poner un índice único para la integridad de los datos y luego usar la inserción condicional porque parece ser la mejor en términos de rendimiento y legibilidad. – Ravi

EDITAR: para evitar race conditions en un entorno concurrente, utilizar WITH (UPDLOCK) en la subconsulta correlacionada.

creo que este sería el método estándar:

-- Insert the candidate row only when no row with the same code and prices exists.
-- UPDLOCK by itself only locks rows that are actually read; when no matching
-- row exists there is nothing to lock, so HOLDLOCK (serializable semantics) is
-- added to keep a concurrent session from inserting the same combination
-- between the existence check and the insert.
INSERT INTO MarketPrices (SecurityCode, BuyPrice, SellPrice, IsMarketOpen) 
SELECT @SecurityCode, @BuyPrice, @SellPrice, @IsMarketOpen 
WHERE NOT EXISTS (
    SELECT * FROM MarketPrices WITH (UPDLOCK, HOLDLOCK) 
    WHERE SecurityCode = @SecurityCode 
    AND BuyPrice = @BuyPrice 
    AND SellPrice = @SellPrice 
)

Si alguno de sus campos admite valores NULL, tendría que añadir eso a la condición.

Su primer método es interesante, pero los requisitos de EXCEPT le obligan a hacer malabares. Este método es esencialmente el mismo, pero le evita el problema de la coincidencia de columnas.

alternativa:

-- EXCEPT-based variant: compare only the three key columns and attach
-- @IsMarketOpen afterwards, so no column-matching tricks are needed.
-- HOLDLOCK is combined with UPDLOCK because UPDLOCK alone locks nothing when
-- the combination is not stored yet, which would leave a window for a
-- concurrent session to insert the same combination.
INSERT INTO MarketPrices (SecurityCode, BuyPrice, SellPrice, IsMarketOpen) 
SELECT SecurityCode, BuyPrice, SellPrice, @IsMarketOpen 
FROM (
    SELECT @SecurityCode, @BuyPrice, @SellPrice 
    EXCEPT 
    SELECT SecurityCode, BuyPrice, SellPrice FROM MarketPrices WITH (UPDLOCK, HOLDLOCK) 
) a (SecurityCode, BuyPrice, SellPrice)

Lo bueno de EXCEPT en este caso es que maneja los valores NULL sin ninguna codificación adicional de su parte. Para lograr lo mismo en el primer ejemplo, tendría que comprobar cada par tanto para NULL como para igualdad, escribiéndolo todo a mano.

Su segundo método está bien, pero no necesita la variable. Ver la solución de Tomalak, la limpió muy bien. Además, necesitaría manejar explícitamente la posibilidad de insertos concurrentes, si eso fuera una preocupación.

Fuente

2009-11-06 16:33:15

Yo preferiría siempre una solución semántica. Sus dos propuestas me parecen bastante oscuras (aunque esta última es mejor que la anterior).

-- Check-then-insert, made safe for concurrent callers.
-- The existence check and the INSERT are two separate statements, so they run
-- inside one transaction and the check takes UPDLOCK + HOLDLOCK: the locks are
-- held until COMMIT, which keeps another session from inserting the same
-- (SecurityCode, BuyPrice, SellPrice) between the check and the insert.
BEGIN TRANSACTION;

IF NOT EXISTS (
    SELECT 1 
    FROM MarketPrices WITH (UPDLOCK, HOLDLOCK) 
    WHERE SecurityCode = @SecurityCode 
     AND BuyPrice = @BuyPrice 
     AND SellPrice = @SellPrice 
) 
BEGIN 
    INSERT MarketPrices 
    (SecurityCode, BuyPrice, SellPrice, IsMarketOpen) 
    VALUES 
    (@SecurityCode, @BuyPrice, @SellPrice, @IsMarketOpen);
END

COMMIT TRANSACTION;

Con un índice agrupado (clustered) sobre SecurityCode, BuyPrice, SellPrice, la consulta EXISTS debería ser bastante rápida.

Benchmarking es una cuestión de temporizar un ciclo WHILE, diría yo. Pruébalo y compruébalo por ti mismo.

Fuente

2009-11-06 16:33:58 Tomalak

Otra opción: crear un índice único en los campos (SecurityCode, BuyPrice, SellPrice) en cuestión, emitir un simple inserto y dejar que la base de datos decida si los registros están duplicados. La inserción fallará en un intento de insertar un duplicado.

El uso de código (ya sea en lenguaje externo o SQL proc) para garantizar la exclusividad no es lo suficientemente estricto y, en última instancia, dará lugar a los mismos duplicados que espera evitar.

Fuente

2009-11-06 16:50:15 mlibby

Estoy pensando que podría estar en lo cierto, especialmente cuando se trata de insertos simultáneos – Ravi

Me gustaría ver ese benchmark. Asumiendo un índice único, ¿qué tiene más sobrecarga: la cláusula WHERE de una inserción condicional o el manejo de excepciones de un bloque TRY/CATCH? Si espera que el 99 % de sus inserciones *no* sean duplicados, me imagino que el bloque TRY/CATCH podría ser más eficiente. –

Voy a hacer exactamente eso cuando llegue a casa - publicaré los resultados aquí – Ravi

Si no necesita atrapar los duplicados, siempre puede crear un índice único con la opción "ignorar duplicados" (IGNORE_DUP_KEY) activada. SQL Server se encargará de esto por usted.

Fuente

2010-12-11 08:26:51 IamIC

A continuación he agregado las respuestas principales de Only inserting a row if it's not already there a la excelente respuesta de Peter Radocchia.

La conclusión es que la técnica race safe with try/catch es marginalmente (~1 %) más rápida que la técnica race safe with updlock, holdlock cuando no hay colisiones reales (es decir, cuando se espera que las colisiones sean muy raras; este es el escenario uniques), y es un poco más lenta (~20 %) cuando siempre hay colisiones (este es el escenario duplicates). Esto no tiene en cuenta cuestiones complejas como la escalada de bloqueos.

Estos son los resultados (SQL Server 2014, compilación 12.0.2000.8):

duplicates (short table)  
    try/catch:      15546 milliseconds/100000 inserts 
    conditional insert:    1460 milliseconds/100000 inserts 
    except:       1490 milliseconds/100000 inserts 
    merge:       1420 milliseconds/100000 inserts 
    race safe with try/catch:   1650 milliseconds/100000 inserts 
    race safe with updlock, holdlock: 1330 milliseconds/100000 inserts 

uniques 
    try/catch:      2266 milliseconds/100000 inserts 
    conditional insert:    2156 milliseconds/100000 inserts 
    except:       2273 milliseconds/100000 inserts 
    merge:       2136 milliseconds/100000 inserts 
    race safe with try/catch:   2400 milliseconds/100000 inserts 
    race safe with updlock, holdlock: 2430 milliseconds/100000 inserts 

    straight insert:     1686 milliseconds/100000 inserts 

duplicates (tall table) 
    try/catch:      15826 milliseconds/100000 inserts 
    conditional insert:    1530 milliseconds/100000 inserts 
    except:       1506 milliseconds/100000 inserts 
    merge:       1443 milliseconds/100000 inserts 
    race safe with try/catch:   1636 milliseconds/100000 inserts 
    race safe with updlock, holdlock: 1426 milliseconds/100000 inserts

Sección de duplicados (tabla corta), seguida de la sección de únicos:

-- Race-safe variant 1 (short table): conditional insert inside TRY/CATCH.
-- Error 2627 is ignored; any other error is rethrown.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    BEGIN TRY
        INSERT #temp
        SELECT @val
        WHERE NOT EXISTS (SELECT * FROM #temp WHERE col1 = @val)
    END TRY
    BEGIN CATCH
        IF ERROR_NUMBER() <> 2627
            THROW;
    END CATCH
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (short table), race safe with try/catch: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-- Race-safe variant 2 (short table): the existence check reads with UPDLOCK and HOLDLOCK.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    INSERT #temp
    SELECT @val
    WHERE NOT EXISTS (SELECT * FROM #temp WITH (UPDLOCK, HOLDLOCK) WHERE col1 = @val)
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (short table), race safe with updlock, holdlock: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-- Race-safe variant 1 (uniques): conditional insert inside TRY/CATCH.
-- The table is emptied first; error 2627 is ignored, anything else is rethrown.
TRUNCATE TABLE #temp
DECLARE @n INT, @started DATETIME, @elapsed_ms INT
SET @n = 0
SET @started = GETDATE()
WHILE @n < 100000
BEGIN
    SET @n = @n + 1
    BEGIN TRY
        INSERT #temp
        SELECT @n
        WHERE NOT EXISTS (SELECT * FROM #temp WHERE col1 = @n)
    END TRY
    BEGIN CATCH
        IF ERROR_NUMBER() <> 2627
            THROW;
    END CATCH
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('uniques, race safe with try/catch: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@n) WITH NOWAIT
GO

-- Race-safe variant 2 (uniques): the existence check reads with UPDLOCK and HOLDLOCK.
TRUNCATE TABLE #temp
DECLARE @n INT, @started DATETIME, @elapsed_ms INT
SET @n = 0
SET @started = GETDATE()
WHILE @n < 100000
BEGIN
    SET @n = @n + 1
    INSERT #temp
    SELECT @n
    WHERE NOT EXISTS (SELECT * FROM #temp WITH (UPDLOCK, HOLDLOCK) WHERE col1 = @n)
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('uniques, race safe with updlock, holdlock: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@n) WITH NOWAIT
GO

Sección de duplicados (tabla alta):

-- Race-safe variant 1 (tall table): conditional insert inside TRY/CATCH.
-- Error 2627 is ignored; any other error is rethrown.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    BEGIN TRY
        INSERT #temp
        SELECT @val
        WHERE NOT EXISTS (SELECT * FROM #temp WHERE col1 = @val)
    END TRY
    BEGIN CATCH
        IF ERROR_NUMBER() <> 2627
            THROW;
    END CATCH
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (tall table), race safe with try/catch: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

-- Race-safe variant 2 (tall table): the existence check reads with UPDLOCK and HOLDLOCK.
DECLARE @val INT, @attempts INT, @started DATETIME, @elapsed_ms INT
SET @val = 1
SET @attempts = 0
SET @started = GETDATE()
WHILE @attempts < 100000
BEGIN
    SET @attempts = @attempts + 1
    INSERT #temp
    SELECT @val
    WHERE NOT EXISTS (SELECT * FROM #temp WITH (UPDLOCK, HOLDLOCK) WHERE col1 = @val)
END
SET @elapsed_ms = DATEDIFF(ms, @started, GETDATE())
RAISERROR('duplicates (tall table), race safe with updlock, holdlock: %i milliseconds/%i inserts',-1,-1,@elapsed_ms,@attempts) WITH NOWAIT
GO

Fuente

2015-05-18 20:35:29

Respuesta

Cuestiones relacionadas