In my job I often have to create a SQL Server procedure that processes millions of rows, saves them into a temp (staging) table, and finally saves them into one or more tables in one or more databases.
I am not looking for alternative solutions, such as SSIS.
I cannot disable the indexes or constraints, take the database offline, change the recovery model, etc.
We have looked into scheduling this process for when the systems are less busy, but we work in a 24/7/365 online retail environment.
There is a very similar question: Performance Inserting and Updating Millions of rows into a table
This question is also relevant: What is the fastest way to insert large numbers of rows?
Example One:
CREATE PROCEDURE [dbo].[udpstaging_page_import_fromFilter]
    @sourceDesc nvarchar(50) -- e.g. 'Coremetrics'
    ,@feedDesc nvarchar(50) -- e.g. 'Daily Exports'
    ,@process_job_task_logID bigint
AS
BEGIN
    SET NOCOUNT ON;

    BEGIN TRY
        -- truncate table prior to INSERT
        exec dbo.udpstaging_page_truncateTable;

        declare @source_feedID int;
        exec crm_admin.dbo.udpsource_feedID_select @sourceDesc
            ,@feedDesc
            ,@source_feedID = @source_feedID OUTPUT;

        -- drop temp tables
        if object_id(N'tempdb..#pageImport', 'U') is not null
            drop table #pageImport;

        -- create temp tables
        create table #pageImport(
            pageImportID [bigint] identity(1,1) NOT NULL
            ,pageCode [varbinary](16) NOT NULL
        );

        insert into #pageImport(
            pageCode
        )
        select pageCode
        from Coremetrics.PageView
        group by pageCode;

        -- add indexes to temp table
        CREATE CLUSTERED INDEX IDX_pageImport_pageImportID ON #pageImport(pageImportID);
        CREATE INDEX IDX_pageImport_pageCode ON #pageImport(pageCode);

        declare @recordCount bigint
            ,@counter int
            ,@maxCounter int
            ,@updateRowCount int;

        select @counter = MIN(pageImportID)
            ,@recordCount = MAX(pageImportID)
        from #pageImport;

        set @updateRowCount = 1000000;

        while @counter <= @recordCount
        begin
            set @maxCounter = (@counter + @updateRowCount - 1);

            with pageImport as (
                select pv.pageCode
                    ,pv.websiteCode as 'pageCIV'
                    ,dbo.udfDerivePageName(pv.PAGE_ID, pv.CONTENT_CATEGORY_ID) as 'pageName'
                    ,dbo.udfDerivePageName(pv.PAGE_ID, pv.CONTENT_CATEGORY_ID) as 'pageDesc'
                    ,pv.[TIMESTAMP] as 'pageCreateDate'
                    ,pv.pageTypeCode
                    ,'' as 'pageTypeCIV'
                    ,pv.websiteCode
                    ,pv.marketID
                    ,@source_feedID as 'source_feedID'
                    ,@process_job_task_logID as 'process_job_task_logID'
                    ,GETDATE() as 'createdDate'
                    ,SUSER_NAME() as 'createdBy'
                    ,GETDATE() as 'modifiedDate'
                    ,SUSER_NAME() as 'modifiedBy'
                    ,ROW_NUMBER() over (
                        PARTITION BY [pi].pageCode
                        ORDER BY pv.[TIMESTAMP]
                    ) as 'is_masterPageImport'
                from #pageImport [pi]
                inner join Coremetrics.PageView pv on pv.pageCode = [pi].pageCode
                    and [pi].pageImportID between @counter and @maxCounter
            )
            insert into staging.[page](
                pageCode
                ,pageCIV
                ,pageName
                ,pageDesc
                ,pageCreateDate
                ,pageTypeCode
                ,pageTypeCIV
                ,websiteCode
                ,marketID
                ,source_feedID
                ,process_job_task_logID
                ,createdDate
                ,createdBy
                ,modifiedDate
                ,modifiedBy
            )
            select pageCode
                ,pageCIV
                ,pageName
                ,pageDesc
                ,pageCreateDate
                ,pageTypeCode
                ,pageTypeCIV
                ,websiteCode
                ,marketID
                ,source_feedID
                ,process_job_task_logID
                ,createdDate
                ,createdBy
                ,modifiedDate
                ,modifiedBy
            from pageImport
            where is_masterPageImport = 1;

            set @counter = @counter + @updateRowCount;
        end;

        SET NOCOUNT OFF;
        RETURN 0;
    END TRY
    BEGIN CATCH
        print N'inner catch: ' + error_message();
        SET NOCOUNT OFF;
        RETURN -10;
    END CATCH
END;
Example Two:
This is just part of a stored procedure that is too big to post here.
IF OBJECT_ID('tempdb.dbo.#ztblOrgProductStockView', 'U') IS NOT NULL
DROP TABLE #ztblOrgProductStockView;
CREATE TABLE #ztblOrgProductStockView (
[lngID] [int] NOT NULL IDENTITY PRIMARY KEY,
[sintMarketId] [smallint] NOT NULL,
[sintChannelId] [smallint] NOT NULL,
[strOrgVwName] [varchar](50) COLLATE Latin1_General_CI_AS NOT NULL,
[tintSequence] [tinyint] NOT NULL,
[tintOrgGrpId] [tinyint] NOT NULL,
[strTier1] [varchar](20) COLLATE Latin1_General_CI_AS NOT NULL,
[strTier2] [varchar](20) COLLATE Latin1_General_CI_AS NOT NULL,
[strTier3] [varchar](20) COLLATE Latin1_General_CI_AS NOT NULL,
[strTier4] [varchar](20) COLLATE Latin1_General_CI_AS NOT NULL,
[strTier5] [varchar](20) COLLATE Latin1_General_CI_AS NOT NULL,
[strTier6] [varchar](20) COLLATE Latin1_General_CI_AS NOT NULL,
[strItemNo] [varchar](20) COLLATE Latin1_General_CI_AS NOT NULL,
[strStockTypeName] [varchar](50) COLLATE Latin1_General_CI_AS NOT NULL,
[tintStockTypeId] [tinyint] NOT NULL,
[sintDueWithinDays] [tinyint] NOT NULL,
[bitOverSellingAllowed] [bit] NOT NULL,
[dtmStartDate] [datetime] NULL,
[dtmEndDate] [datetime] NULL,
[dtmExpected] [datetime] NULL,
[blnIsLocalToMarket] [bit] NULL,
[blnPremiunDelvAllowed] [bit] NULL,
[strStdDeliveryDaysCode] [varchar](20)
)
INSERT into #ztblOrgProductStockView (
sintMarketId
,sintChannelId
,strOrgVwName
,tintSequence
,tintOrgGrpId
,strTier1
,strTier2
,strTier3
,strTier4
,strTier5
,strTier6
,strItemNo
,strStockTypeName
,tintStockTypeId
,sintDueWithinDays
,bitOverSellingAllowed
,dtmStartDate
,dtmEndDate
,dtmExpected
,blnIsLocalToMarket
,blnPremiunDelvAllowed
,strStdDeliveryDaysCode
)
select
rv.sintMarketId
,rv.sintChannelId
,rv.strOrgVwName
,tintSequence
,tintOrgGrpId
,ISNULL(rv.pnTier1,'ÿ')
,ISNULL(rv.pnTier2,'ÿ')
,ISNULL(rv.pnTier3,'ÿ')
,ISNULL(rv.strTier4,'ÿ')
,ISNULL(rv.strTier5,'ÿ')
,ISNULL(rv.strTier6,'ÿ')
,rv.strItemNo
,strStockTypeName
,tintStockTypeId
,sintDueWithinDays
,bitOverSellingAllowed
,dtmStartDate
,dtmEndDate
,dtmExpected
,blnIsLocalToMarket
,blnPremiunDelvAllowed
,strStdDeliveryDaysCode
from #ztblOrgProductRangeView_1 rv
inner join #ztblOrgProductSeqView_1 sv on rv.strItemNo = sv.strItemNo
and rv.lngOrgVwId = sv.lngOrgVwId
--order by rv.sintMarketId, rv.sintChannelId, sv.tintOrgGrpId, rv.strItemNo, sv.tintStockTypeId
--set @DebugDate = convert(nvarchar(10),getdate(),108)
--raiserror('%s [%s]', 0, 1, N'Populated #ztblOrgProductStockView', @DebugDate) with nowait
--select [sintMarketId], [sintChannelId], [tintOrgGrpId], [strItemNo], [tintStockTypeId], count(*)
--from [#ztblOrgProductStockView]
--group by [sintMarketId], [sintChannelId], [tintOrgGrpId], [strItemNo], [tintStockTypeId]
--having count(*) > 1
set @lngRowcount = @@ROWCOUNT
set nocount on;
While @lngRowcount > 0
Begin
    Set @lngMinID = @lngMaxID
    Set @lngMaxID = @lngMaxID + 5000

    INSERT INTO [ztblOrgProductStockView]
        ([sintActiveView]
        ,[sintMarketId]
        ,[sintChannelId]
        ,[strOrgVwName]
        ,[tintSequence]
        ,[tintOrgGrpId]
        ,[strTier1]
        ,[strTier2]
        ,[strTier3]
        ,[strTier4]
        ,[strTier5]
        ,[strTier6]
        ,[strItemNo]
        ,[strStockTypeName]
        ,[tintStockTypeId]
        ,[sintDueWithinDays]
        ,[bitOverSellingAllowed]
        ,[dtmStartDate]
        ,[dtmEndDate]
        ,[dtmExpected]
        ,[blnIsLocalToMarket]
        ,[blnPremiunDelvAllowed]
        ,[strStdDeliveryDaysCode])
    Select
        @sintActiveView_new
        ,[sintMarketId]
        ,[sintChannelId]
        ,[strOrgVwName]
        ,[tintSequence]
        ,[tintOrgGrpId]
        ,[strTier1]
        ,[strTier2]
        ,[strTier3]
        ,[strTier4]
        ,[strTier5]
        ,[strTier6]
        ,[strItemNo]
        ,[strStockTypeName]
        ,[tintStockTypeId]
        ,[sintDueWithinDays]
        ,[bitOverSellingAllowed]
        ,[dtmStartDate]
        ,[dtmEndDate]
        ,[dtmExpected]
        ,[blnIsLocalToMarket]
        ,[blnPremiunDelvAllowed]
        ,[strStdDeliveryDaysCode]
    From #ztblOrgProductStockView
    Where lngID >= @lngMinID
    And lngID < @lngMaxID

    set @lngRowcount = @@ROWCOUNT
End
Questions (note that opinion-based answers are not the most popular here, so please provide evidence where possible):
1) How should I decide on the best batch size? For example, in Example Two it is 5,000.
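For instance, I imagine I could instrument the loop from Example One like this, rerun the load with different values of @updateRowCount, and compare the timings (just a sketch; #batchTiming and @rows are names I made up, the loop variables are the ones from Example One):

-- hypothetical timing table, not part of the real procedure
create table #batchTiming (
    batchSize int not null
    ,rowsInserted int not null
    ,durationMs int not null
);

declare @batchStart datetime2(3)
    ,@rows int;

while @counter <= @recordCount
begin
    set @maxCounter = (@counter + @updateRowCount - 1);
    set @batchStart = SYSDATETIME();

    -- ... the batched INSERT from Example One goes here ...

    set @rows = @@ROWCOUNT; -- must run immediately after the INSERT
    insert into #batchTiming (batchSize, rowsInserted, durationMs)
    values (@updateRowCount, @rows, DATEDIFF(millisecond, @batchStart, SYSDATETIME()));

    set @counter = @counter + @updateRowCount;
end;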
2) Would it generally improve performance if I put BEGIN TRANSACTION and COMMIT TRANSACTION inside the while loop, i.e. one transaction per batch?
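In other words, something along these lines around Example Two's INSERT (just a sketch; the column lists are elided, they are the same as above):

While @lngRowcount > 0
Begin
    Set @lngMinID = @lngMaxID
    Set @lngMaxID = @lngMaxID + 5000

    BEGIN TRANSACTION;

    INSERT INTO [ztblOrgProductStockView] ( /* full column list as in Example Two */ )
    Select /* full select list as in Example Two */
    From #ztblOrgProductStockView
    Where lngID >= @lngMinID
    And lngID < @lngMaxID;

    set @lngRowcount = @@ROWCOUNT; -- read immediately after the INSERT

    COMMIT TRANSACTION;
End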
3) If I wanted to change the batch size, what could I monitor to decide whether I can increase it, or whether I am causing I/O latency?
I currently find the I/O latency using the script below:
-- How to identify I/O latency issues
-- The SQL code below helps identify I/O latency issues in a SQL Server system on a per-file basis.
-- http://sqlserverdbknowledge.wordpress.com/2011/11/08/how-to-identify-io-latency-issues/
--http://www.sqlskills.com/blogs/paul/how-to-examine-io-subsystem-latencies-from-within-sql-server/
--MARCELO MIORELLI 26-JULY-2013
SELECT
--- virtual file latency
ReadLatency = CASE WHEN num_of_reads = 0
THEN 0 ELSE (io_stall_read_ms / num_of_reads) END,
WriteLatency = CASE WHEN num_of_writes = 0
THEN 0 ELSE (io_stall_write_ms / num_of_writes) END,
Latency = CASE WHEN (num_of_reads = 0 AND num_of_writes = 0)
THEN 0 ELSE (io_stall / (num_of_reads + num_of_writes)) END,
--- avg bytes per IOP
AvgBPerRead = CASE WHEN num_of_reads = 0
THEN 0 ELSE (num_of_bytes_read / num_of_reads) END,
AvgBPerWrite = CASE WHEN num_of_writes = 0
THEN 0 ELSE (num_of_bytes_written / num_of_writes) END,
AvgBPerTransfer = CASE WHEN (num_of_reads = 0 AND num_of_writes = 0)
THEN 0 ELSE ((num_of_bytes_read + num_of_bytes_written) /
(num_of_reads + num_of_writes)) END,
LEFT (mf.physical_name, 2) AS Drive,
DB_NAME (vfs.database_id) AS DB,
---vfs.*,
mf.physical_name
FROM sys.dm_io_virtual_file_stats (NULL,NULL) AS vfs
JOIN sys.master_files AS mf
ON vfs.database_id = mf.database_id
AND vfs.file_id = mf.file_id
--WHERE vfs.file_id = 2 -- log files
-- ORDER BY Latency DESC
-- ORDER BY ReadLatency DESC
ORDER BY WriteLatency DESC;
GO
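One thing I am aware of is that sys.dm_io_virtual_file_stats is cumulative since instance startup, so to isolate the latency caused by a single batch run I assume I could snapshot it before and after the run and diff the counters (sketch; #ioBefore is a name I made up):

-- snapshot the cumulative counters before the batch run
select database_id, file_id,
       num_of_reads, io_stall_read_ms,
       num_of_writes, io_stall_write_ms
into #ioBefore
from sys.dm_io_virtual_file_stats(NULL, NULL);

-- ... run the batched load here ...

-- diff after the run: read/write latency attributable to this window only
select DB_NAME(a.database_id) as DB,
       a.file_id,
       ReadLatency = case when (a.num_of_reads - b.num_of_reads) = 0 then 0
           else (a.io_stall_read_ms - b.io_stall_read_ms)
              / (a.num_of_reads - b.num_of_reads) end,
       WriteLatency = case when (a.num_of_writes - b.num_of_writes) = 0 then 0
           else (a.io_stall_write_ms - b.io_stall_write_ms)
              / (a.num_of_writes - b.num_of_writes) end
from sys.dm_io_virtual_file_stats(NULL, NULL) as a
join #ioBefore as b
  on a.database_id = b.database_id
 and a.file_id = b.file_id
order by WriteLatency desc;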