Sunday, October 22, 2017

Azure Data Factory Version 2.0

There are lots of improvements in Azure Data factory  in Version 2.0,  which is still in preview, even UI is not available but here are few steps in Power-Shell to create and configure Azure data factory for version 2.0.

Below example creates a pipeline in azure data factory v2.0 via power shell. For that it moves data from one SQL table to another SQL table.


Prerequisites
  1. Create 2 tables in your SQL server, need to move data from ADFSync1 to ADFSync2 table

CREATE TABLE [dbo].[ADFSync1](
[ADFSyncId] [bigint] IDENTITY(1,1) NOT NULL,
[PipelineName] [nvarchar](50) NOT NULL,
[ActivityName] [nvarchar](50) NOT NULL,
[SourceDB] [nvarchar](50) NOT NULL,
[DestinationDB] [nvarchar](50) NOT NULL,
[Description] [nvarchar](255) NOT NULL,
[StartDateTime] [datetime] NULL,
[EndDateTime] [datetime] NULL,
[NoOfRecords] [int] NOT NULL,
[Status] [nvarchar](50) NOT NULL,
[LastExecutionDate] [datetime] NOT NULL,
 CONSTRAINT [PK_ADFSync1] PRIMARY KEY CLUSTERED
(
[ADFSyncId] ASC
)WITH (STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF)
)
GO

Also insert few records in ADFSync1

Few Variables for PowerShell
 $ResourceGroupName='ADFDemo'
 $DataFactoryName='ADFV2-Test'
 $Location='EAST US 2'


  • Create data Factory
First of all login to Azure subscription
 #********** Login to Azure account **********

Login-AzureRmAccount
Select-AzureRmSubscription -SubscriptionName '<<Your Azure Subscription Name>>'

$dataFactory = Set-AzureRmDataFactoryV2 -ResourceGroupName $ResourceGroupName -Name $DataFactoryName -Location $Location -ErrorAction Stop

Once you create this , you can see DATA factory is created in your Azure subscription




  • Configure/ Create Source and destination connection
We need 2 Json files for source and destination, for this example copy and paste below text in 2 separate json files with name "Source-DB" and "Destination-DB".


{
    "name": "Source-DB",
    "properties": {
        "type": "AzureSqlDatabase",
        "description": "",
      "typeProperties": {
        "connectionString": {
          "value": "Data Source=<your server name>.database.windows.net;Initial Catalog=<your DBName>;User ID=DBUSerID;Password=<<ur password>>;Integrated Security=False;Encrypt=True;Connect Timeout=30",
          "type": "SecureString"
        }
      }
    }
}

Now based on above json create 2 linked services

Set-AzureRmDataFactoryV2LinkedService  -DataFactoryName $DataFactoryName -Name 'source-DB' -ResourceGroupName $ResourceGroupName -DefinitionFile 'source-DB.json' -Force -ErrorAction Stop

Set-AzureRmDataFactoryV2LinkedService  -DataFactoryName $DataFactoryName -Name 'destination-DB' -ResourceGroupName $ResourceGroupName -DefinitionFile 'destination-DB.json' -Force -ErrorAction Stop

  • Configure/create dataset
Now need to create 2 datasets, one for input and another for output
Json file for both is below
Dataset-Input-ADFSync
{
  "name": "Dataset-Input-ADFSync",
  "properties": {
    "structure": [
      {
        "name": "PipelineName",
        "type": "String"
      },
      {
        "name": "ActivityName",
        "type": "String"
      },
      {
        "name": "SourceDB",
        "type": "String"
      },
      {
        "name": "DestinationDB",
        "type": "String"
      },
      {
        "name": "Description",
        "type": "String"
      },
      {
        "name": "StartDateTime",
        "type": "Datetime"
      },
      {
        "name": "EndDateTime",
        "type": "Datetime"
      },
      {
        "name": "NoOfRecords",
        "type": "Int64"
      },
      {
        "name": "Status",
        "type": "String"
      },
      {
        "name": "LastExecutionDate",
        "type": "Datetime"
      }
    ],
    "type": "AzureSqlTable",
    "linkedServiceName": {
      "referenceName": "Source-DB",
      "type": "LinkedServiceReference"
    },
    "typeProperties": {
      "tableName": "[dbo].[ADFSync1]"
    }
  }
}

Dataset-Output-ADFSync
{
    "name": "Dataset-Output-ADFSync",
  "properties": {
    "structure": [
      {
        "name": "PipelineName",
        "type": "String"
      },
      {
        "name": "ActivityName",
        "type": "String"
      },
      {
        "name": "SourceDB",
        "type": "String"
      },
      {
        "name": "DestinationDB",
        "type": "String"
      },
      {
        "name": "Description",
        "type": "String"
      },
      {
        "name": "StartDateTime",
        "type": "Datetime"
      },
      {
        "name": "EndDateTime",
        "type": "Datetime"
      },
      {
        "name": "NoOfRecords",
        "type": "Int64"
      },
      {
        "name": "Status",
        "type": "String"
      },
      {
        "name": "LastExecutionDate",
        "type": "Datetime"
      }
    ],
    "type": "AzureSqlTable",
    "linkedServiceName": {
      "referenceName": "Destination-DB",
      "type": "LinkedServiceReference"
    },
    "typeProperties": {
      "tableName": "[dbo].[ADFSync2]"
    }
   
  }
}

Now in PowerShell need to execute following commands.
  Set-AzureRmDataFactoryV2Dataset -DataFactoryName $DataFactoryName -Name 'Dataset-Input-ADFSync' -ResourceGroupName $ResourceGroupName -DefinitionFile 'Dataset-Input-ADFSync.json' -Force -ErrorAction Stop

  Set-AzureRmDataFactoryV2Dataset -DataFactoryName $DataFactoryName -Name 'Dataset-Output-ADFSync'-ResourceGroupName $ResourceGroupName -DefinitionFile 'Dataset-Output-ADFSync.json' -Force -ErrorAction Stop

  • Create Pipeline
Now create json for pipeline and add that via powershell, this pipeline should have one activity.

{
    "name": "Pipeline-99",
  "properties": {
    "description": "Pipeline to Sync ADFSync Table",
      "activities": [
        {
          "type": "Copy",
          "typeProperties": {
            "source": {
              "type": "SqlSource",
              "sqlReaderQuery": "select [PipelineName],[ActivityName],[SourceDB],[DestinationDB],[Description],[StartDateTime],[EndDateTime],[NoOfRecords],[Status],[LastExecutionDate] from [dbo].[ADFSync1]"
            },
            "sink": {
              "type": "SqlSink",
              "sqlWriterCleanupScript": "$$Text.Format('delete from [dbo].[ADFSync2]')",
              "writeBatchSize": 0,
              "writeBatchTimeout": "00:00:00"
            },
            "translator": {
              "type": "TabularTranslator",
              "columnMappings": "PipelineName:PipelineName,ActivityName:ActivityName,SourceDB:SourceDB,DestinationDB:DestinationDB,Description:Description,StartDateTime:StartDateTime,EndDateTime:EndDateTime,NoOfRecords:NoOfRecords,Status:Status,LastExecutionDate:LastExecutionDate"
            }
          },
          "inputs": [
            {
              "referenceName": "Dataset-Input-ADFSync",
              "type": "DatasetReference"
            }
          ],
          "outputs": [
            {
              "referenceName": "Dataset-Output-ADFSync",
              "type": "DatasetReference"
            }
          ],
          "policy": {
            "timeout": "1.00:00:00",
            "concurrency": 1,
            "executionPriorityOrder": "NewestFirst",
            "style": "StartOfInterval",
            "retry": 3,
            "longRetry": 0,
            "longRetryInterval": "00:00:00"
          },
          "name": "Activity-ADFSync"
        }
      ]

    }
  }

Powershell command to add this pipeline.

SET-AzureRmDataFactoryV2Pipeline -DataFactoryName $DataFactoryName -Name 'Pipeline-99' -ResourceGroupName $ResourceGroupName -DefinitionFile 'Pipeline-99.json' -Force -ErrorAction Stop

  • Trigger Pipeline
Now create json (Trigger-Pipelines.json) for pipeline and add that via powershell, this pipeline should have one activity.
{
  "properties": {
    "type": "ScheduleTrigger",
    "typeProperties": {
      "recurrence": {
        "frequency": "Minute",
        //"<<Minute",        "Hour",        "Day",        "Week",        "Year>>",
        "interval": "5", // optional, how often to fire (default to 1)
        "startTime": "2017-10-09",
        "endTime": "2099-10-09",
        "timeZone": "UTC"
            },
      "pipelines": [
        {
          "pipelineReference": {
            "type": "PipelineReference",
            "referenceName": "Pipeline-99"
          },
          "parameters": {
            "Param1": {
              "type": "String",
              "Value": "abc"
            },
            "Param2": {
              "type": "String",
              "Value": "abc"
            }
          }
        }
      ]
    }
  }
  }

Powershell to trigger the pipeline

$runId = Invoke-AzureRmDataFactoryV2Pipeline -DataFactoryName $dataFactoryName -ResourceGroupName $resourceGroupName -PipelineName "Pipeline-99" -ParameterFile 'Trigger-Pipelines.json' -ErrorAction Stop


 Write-Host "Activity run details:" -foregroundcolor "Yellow"
    $result = Get-AzureRmDataFactoryV2ActivityRun -DataFactoryName $dataFactoryName -ResourceGroupName $resourceGroupName -PipelineRunId $runId -RunStartedAfter (Get-Date).AddMinutes(-3) -RunStartedBefore (Get-Date).AddMinutes(3)

    $result