I'm trying to submit a MapReduce job to an HDInsight cluster. My job has no reduce step because I don't want to reduce anything. All I want to do is parse each file name and append the values to every line in that file, so that all the data I need is inside the file itself.
My code is
using Microsoft.Hadoop.MapReduce;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace GetMetaDataFromFileName
{
class Program
{
/// <summary>
/// Entry point: obtains the cluster handle from <c>connectAzure</c>, sets the
/// Hadoop-related environment variables, and executes <c>MetaDataGetterJob</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    IHadoop cluster = connectAzure();

    // Temporary workaround: set the environment variables from code.
    Environment.SetEnvironmentVariable("HADOOP_HOME", @"c:\hadoop");
    Environment.SetEnvironmentVariable("Java_HOME", @"c:\hadoop\jvm");

    // The value returned by ExecuteJob is not inspected.
    cluster.MapReduceJob.ExecuteJob<MetaDataGetterJob>();
}
/// <summary>
/// Creates the <c>IHadoop</c> handle by passing the cluster and storage
/// settings below to <c>Hadoop.Connect</c>.
/// </summary>
/// <returns>The object returned by <c>Hadoop.Connect</c>.</returns>
static IHadoop connectAzure()
{
    //TODO: Update credentials and other information
    var clusterUri = new Uri("https://sampleclustername.azurehdinsight.net//");
    const string userName = "admin";
    const string hadoopUser = "Hadoop";
    const string password = "password";
    const string storageAccount = "blobstoragename.blob.core.windows.net"; // Storage account that holds the log files
    const string storageKey = "AccessKeySample";                           // Storage account access key
    const string containerName = "logs";                                   // Container name

    // NOTE(review): the meaning of the final boolean is not visible in this file;
    // presumably it controls container creation -- confirm against the SDK.
    return Hadoop.Connect(
        clusterUri,
        userName,
        hadoopUser,
        password,
        storageAccount,
        storageKey,
        containerName,
        true);
}
/// <summary>
/// Hadoop mapper that prefixes every input line with six metadata values taken
/// from the underscore-separated name of the file the line was read from.
/// </summary>
public class MetaDataGetter : MapperBase
{
    // Number of '_'-separated fields read from the file name:
    // public IP, physical adapter MAC, boot ID, server upload time, log type, machine up time.
    private const int MetaDataFieldCount = 6;

    private static readonly char[] PathSeparators = { '/', '\\' };

    /// <summary>
    /// Emits <paramref name="inputLine"/> prefixed with the file-name metadata as
    /// comma-terminated CSV columns. Lines from files whose name is missing or has
    /// fewer than six fields are skipped instead of failing the task.
    /// </summary>
    /// <param name="inputLine">The current line of the input file.</param>
    /// <param name="context">Mapper context supplying the input file name and the output sink.</param>
    public override void Map(string inputLine, MapperContext context)
    {
        string fileName = context.InputFilename;
        if (string.IsNullOrEmpty(fileName))
        {
            return;
        }

        // NOTE(review): InputFilename may carry the full input path/URI rather than the
        // bare file name -- confirm. Stripping any directory part is a no-op for a bare
        // name and keeps path segments out of the first metadata field.
        int lastSeparator = fileName.LastIndexOfAny(PathSeparators);
        if (lastSeparator >= 0)
        {
            fileName = fileName.Substring(lastSeparator + 1);
        }

        string[] fields = fileName.Split('_');
        if (fields.Length < MetaDataFieldCount)
        {
            // Indexing past the end throws IndexOutOfRangeException, which the
            // ArgumentException handler below would not catch; skip the line explicitly.
            return;
        }

        try
        {
            // Same output as the original format string: six trimmed values, each followed by a comma.
            string rowHeader = string.Join(",", fields.Take(MetaDataFieldCount).Select(f => f.Trim())) + ",";
            context.EmitLine(rowHeader + inputLine);
        }
        catch (ArgumentException)
        {
            // Best-effort behaviour kept from the original: drop the line on ArgumentException.
            return;
        }
    }
}
/// <summary>
/// Hadoop job definition that uses <c>MetaDataGetter</c> as its mapper.
/// </summary>
public class MetaDataGetterJob : HadoopJob<MetaDataGetter>
{
    /// <summary>
    /// Builds the job configuration: input path, output folder, and
    /// <c>DeleteOutputFolder</c> set to <c>true</c>.
    /// </summary>
    /// <param name="context">Executor context; not used.</param>
    /// <returns>The populated job configuration.</returns>
    public override HadoopJobConfiguration Configure(ExecutorContext context)
    {
        return new HadoopJobConfiguration
        {
            InputPath = "asv://[email protected]/Input",
            OutputFolder = "asv://[email protected]/Output",
            DeleteOutputFolder = true
        };
    }
}
}
}
What is usually the reason for a 500 (Server Error)? Am I supplying the wrong credentials? Also, I don't really understand the difference between the Username and HadoopUser parameters of the Hadoop.Connect method.
Thank you,
I had approximately the same issue in the past (I was unable to submit a Hive job to the cluster and got a BadGateway response). I contacted the support team, and in my case the problem was a memory leak on the head node, which means the problem was not on the client side; it seems to be a problem inherited from Hadoop itself.
I solved it by redeploying the cluster. Have you tried submitting other (simple) jobs? If so, then I suggest contacting the Azure support team, or just redeploying the cluster if that's not painful for you.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With