fix compiler warnings; update README.md with details for running samples
danielli90 committed Nov 4, 2015
1 parent 07ca1ac commit 49eea2f
Showing 4 changed files with 75 additions and 43 deletions.
74 changes: 38 additions & 36 deletions README.md
@@ -11,37 +11,37 @@ wordCounts.SaveAsTextFile(@"hdfs://path/to/wordcount.txt");
```
A simple DataFrame application using TempTable may look like the following
```c#
var requestDataFrame = sqlContext.TextFile(@"hdfs://path/to/requests.csv");
var metricsDateFrame = sqlContext.TextFile(@"hdfs://path/to/metrics.csv");
requestDataFrame.RegisterTempTable("requests");
metricsDateFrame.RegisterTempTable("metrics");
// C0 - guid in requests DF, C3 - guid in metrics DF
var join = GetSqlContext().Sql(
var reqDataFrame = sqlContext.TextFile(@"hdfs://path/to/requests.csv");
var metricDataFrame = sqlContext.TextFile(@"hdfs://path/to/metrics.csv");
reqDataFrame.RegisterTempTable("requests");
metricDataFrame.RegisterTempTable("metrics");
// C0 - guid in requests DataFrame, C3 - guid in metrics DataFrame
var joinDataFrame = sqlContext.Sql(
"SELECT joinedtable.datacenter" +
", MAX(joinedtable.latency) maxlatency" +
", AVG(joinedtable.latency) avglatency " +
"FROM (" +
"SELECT a.C1 as datacenter, b.C6 as latency " +
"FROM requests a JOIN metrics b ON a.C0 = b.C3) joinedtable " +
"GROUP BY datacenter");
join.ShowSchema();
join.Show();
"GROUP BY datacenter");
joinDataFrame.ShowSchema();
joinDataFrame.Show();
```
A simple DataFrame application using DataFrame DSL may look like the following
``` c#
// C0 - guid, C1 - datacenter
var requestDataFrame = sqlContext.TextFile(@"hdfs://path/to/requests.csv")
.Select("C0", "C1");
var reqDataFrame = sqlContext.TextFile(@"hdfs://path/to/requests.csv")
.Select("C0", "C1");
// C3 - guid, C6 - latency
var metricsDateFrame = sqlContext.TextFile(@"hdfs://path/to/metrics.csv", ",", false, true)
.Select("C3", "C6"); //override delimiter, hasHeader & inferSchema
var joinDataFrame = requestDataFrame.Join(metricsDateFrame, requestDataFrame["C0"] == metricsDateFrame["C3"])
.GroupBy("C1");
var metricDataFrame = sqlContext.TextFile(@"hdfs://path/to/metrics.csv", ",", false, true)
.Select("C3", "C6"); //override delimiter, hasHeader & inferSchema
var joinDataFrame = reqDataFrame.Join(metricDataFrame, reqDataFrame["C0"] == metricDataFrame["C3"])
.GroupBy("C1");
var maxLatencyByDcDataFrame = joinDataFrame.Agg(new Dictionary<string, string> { { "C6", "max" } });
maxLatencyByDcDataFrame.ShowSchema();
maxLatencyByDcDataFrame.Show();
```
Refer to SparkCLR\csharp\Samples directory for complete samples
Refer to `SparkCLR\csharp\Samples` directory for complete samples

## Documents
Refer to the docs @ https://github.com/Microsoft/SparkCLR/tree/master/docs
@@ -54,57 +54,59 @@ Refer to the docs @ https://github.com/Microsoft/SparkCLR/tree/master/docs
* [Nuget command-line utility](https://docs.nuget.org/release-notes) 3.2 and above

### Instructions
* Navigate to SparkCLR\scala directory and run the following command to build spark-clr*.jar
* Navigate to `SparkCLR\scala` directory and run the following command to build spark-clr*.jar
```
mvn package
```
* Start Developer Command Prompt for Visual Studio, navigate to SparkCLR\csharp directory, run the following commands to add nuget.exe to the path
* Start Developer Command Prompt for Visual Studio, navigate to `SparkCLR\csharp` directory, run the following commands to add `nuget.exe` to the path
```
set PATH=<fullpath to nuget.exe>;%PATH%
```
And build the rest of the .NET binaries
```
build.cmd
```
* Under SparkCLR\csharp directory, run the following command to clean the .NET binaries built above
* Optional. Under `SparkCLR\csharp` directory, run the following command to clean the .NET binaries built above
```
clean.cmd
```

## Running Samples
### Prerequisites
Set the following environment variables
DataFrame TextFile API uses the `spark-csv` package to load data from CSV files. The latest [commons-csv-*.jar](http://commons.apache.org/proper/commons-csv/download_csv.cgi) and [spark-csv*.jar (Scala version: 2.10)](http://spark-packages.org/package/databricks/spark-csv) should be downloaded manually.

The following environment variables should be set properly:
* ```JAVA_HOME```
* ```SCALA_HOME```
* ```SPARKCLR_HOME```
* ```SPARKCSV_JARS``` (if sqlContext.TextFile method is used to create DataFrame from csv files)

Directory pointed by ```SPARKCLR_HOME``` should have the following directories and files
* **lib** (spark-clr*.jar)
* **bin** (Microsoft.Spark.CSharp.Adapter.dll, CSharpWorker.exe)
* **scripts** (sparkclr-submit.cmd)
* **samples** (SparkCLRSamples.exe, Microsoft.Spark.CSharp.Adapter.dll, CSharpWorker.exe)
* **data** (all the data files used by samples)
* ```SPARKCSV_JARS``` should include fullpaths to `commons-csv*.jar` and `spark-csv*.jar`. For example:
```
set SPARKCSV_JARS=%SPARKCLR_HOME%\lib\commons-csv-1.2.jar;%SPARKCLR_HOME%\lib\spark-csv_2.10-1.2.0.jar
```
* ```SPARKCLR_HOME``` should point to a directory prepared with the following subdirectories:
* **lib** (`spark-clr*.jar`)
* **bin** (`SparkCLR\csharp\Samples\Microsoft.Spark.CSharp\bin\[Debug|Release]\*`, including `Microsoft.Spark.CSharp.Adapter.dll`, `CSharpWorker.exe`, `SparkCLRSamples.exe`, `SparkCLRSamples.exe.Config`, etc.)
* **scripts** (`sparkclr-submit.cmd`)
* **data** (`SparkCLR\csharp\Samples\Microsoft.Spark.CSharp\data\*`)
### Running in Local mode
Set ```CSharpWorkerPath``` in SparkCLRSamples.exe.config and run the following. Note that SparkCLR jar version (**1.4.1**) should be aligned with Apache Spark version.
Set `CSharpWorkerPath` in `SparkCLRSamples.exe.config` and run the following. Note that the SparkCLR jar version (**1.4.1**) should be aligned with the Apache Spark version.
```
sparkclr-submit.cmd --verbose D:\SparkCLRHome\lib\spark-clr-1.4.1-SNAPSHOT.jar D:\SparkCLRHome\SparkCLRSamples.exe spark.local.dir D:\temp\SparkCLRTemp sparkclr.sampledata.loc D:\SparkCLRHome\data
sparkclr-submit.cmd --verbose %SPARKCLR_HOME%\lib\spark-clr-1.4.1-SNAPSHOT.jar %SPARKCLR_HOME%\bin\SparkCLRSamples.exe spark.local.dir C:\temp\SparkCLRTemp sparkclr.sampledata.loc %SPARKCLR_HOME%\data
```
Setting spark.local.dir parameter is optional and it is useful if local setup of Spark uses %TEMP% directory in windows to which adding SparkCLR driver exe file may cause problems (AV programs might automatically delete executables placed in these directories)
Setting the `spark.local.dir` parameter is important. When a local Spark instance distributes SparkCLR driver executables to the Windows `%TEMP%` directory, anti-virus software may detect the executables that show up in `%TEMP%` and flag them as malware.
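For illustration, here is a minimal sketch of what consuming that setting could look like, assuming the samples read their configuration through the standard .NET `ConfigurationManager` appSettings mechanism (the actual lookup logic inside SparkCLR may differ):
```c#
using System;
using System.Configuration; // requires a reference to System.Configuration.dll

class CSharpWorkerPathSketch
{
    static void Main()
    {
        // Reads the CSharpWorkerPath key from the hosting process's
        // .exe.config file (SparkCLRSamples.exe.config in this case).
        string workerPath = ConfigurationManager.AppSettings["CSharpWorkerPath"];
        Console.WriteLine("CSharpWorkerPath = " + (workerPath ?? "<not set>"));
    }
}
```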
### Running in Standalone cluster mode
```
sparkclr-submit.cmd --verbose D:\SparkCLRHome\lib\spark-clr-1.4.1-SNAPSHOT.jar D:\SparkCLRHome\SparkCLRSamples.exe sparkclr.sampledata.loc hdfs://path/to/sparkclr/sampledata
sparkclr-submit.cmd --verbose %SPARKCLR_HOME%\lib\spark-clr-1.4.1-SNAPSHOT.jar %SPARKCLR_HOME%\bin\SparkCLRSamples.exe sparkclr.sampledata.loc hdfs://path/to/sparkclr/sampledata
```
### Running in YARN mode
To be added
## Running Unit Tests
* In Visual Studio: "Test" -> "Run" -> "All Tests"
* In Developer Command Prompt for VS, navigate to SparkCLR\csharp and run the following command
* In Developer Command Prompt for VS, navigate to `SparkCLR\csharp` and run the following command
```
test.cmd
```
@@ -113,8 +115,8 @@ To be added
The CSharpBackend and the C# driver are launched separately when debugging the SparkCLR adapter or a driver.
For example, to debug the SparkCLR samples:
* Launch CSharpBackend using ```sparkclr-submit.cmd debug``` and get the port number displayed in the console
* Navigate to csharp/Samples/Microsoft.Spark.CSharp and edit App.Config to use the port number from the previous step for CSharpBackendPortNumber config and also set CSharpWorkerPath config
* Run SparkCLRSamples.exe in Visual Studio
* Navigate to `csharp/Samples/Microsoft.Spark.CSharp` and edit `App.Config` to use the port number from the previous step for the `CSharpBackendPortNumber` setting, and also set the `CSharpWorkerPath` setting
* Run `SparkCLRSamples.exe` in Visual Studio
## License
SparkCLR is licensed under the MIT license. See LICENSE file in the project root for full license information.
17 changes: 12 additions & 5 deletions csharp/Adapter/Microsoft.Spark.CSharp/Configuration/ConfigurationService.cs
@@ -19,9 +19,10 @@ namespace Microsoft.Spark.CSharp.Configuration
/// to be used in SparkCLR runtime
/// </summary>
internal class ConfigurationService : IConfigurationService
{
{
private ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(ConfigurationService));
private SparkCLRConfiguration configuration;
private RunMode runMode; //not used anywhere for now but may come handy in the future
private RunMode runMode = RunMode.UNKNOWN; //not used anywhere for now but may come in handy in the future

public int BackendPortNumber
{
@@ -50,14 +51,19 @@ internal ConfigurationService()
configuration = new SparkCLRConfiguration(appConfig);
runMode = RunMode.CLUSTER;
}
else if (sparkMaster.StartsWith("yarn"))
{
else if (sparkMaster.StartsWith("yarn"))
{
runMode = RunMode.YARN;
throw new NotSupportedException("YARN is not currently supported");
}
else
{
throw new NotSupportedException(string.Format("Spark master value {0} not reconginzed", sparkMaster));
throw new NotSupportedException(string.Format("Spark master value {0} not recognized", sparkMaster));
}

// Workaround for compiler warning CS0414:
// the field runMode is assigned but its value is never used
logger.LogInfo(string.Format("ConfigurationService runMode is {0}", runMode));
}

public string GetCSharpRDDExternalProcessName()
@@ -177,6 +183,7 @@ internal override int GetPortNumber()

public enum RunMode
{
UNKNOWN,
DEBUG, //not a Spark mode but exists for dev debugging purpose
LOCAL,
CLUSTER,
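The comment block in the constructor above references compiler warning CS0414. As a standalone illustration (not code from this repository) of why logging the field works as a fix: CS0414 fires when a private field is assigned but never read, and any read, such as writing the value to a log, counts as a use.
```c#
using System;

class Cs0414Demo
{
    // private int unused = 42;   // would trigger CS0414: assigned but its value is never used
    private int used = 42;        // no warning: the field is read below

    public void Run()
    {
        // Reading the field anywhere in the class is enough to silence CS0414.
        Console.WriteLine("used = " + used);
    }
}
```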
25 changes: 24 additions & 1 deletion csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs
@@ -335,7 +335,7 @@ public static JoinType LeftSemi

public class Column
{
private IColumnProxy columnProxy;
private readonly IColumnProxy columnProxy;

internal IColumnProxy ColumnProxy
{
@@ -359,6 +359,29 @@ internal Column(IColumnProxy columnProxy)
{
throw new NotImplementedException();
}

/// <summary>
/// Mitigate compiler warning CS0661:
/// 'Microsoft.Spark.CSharp.Sql.Column' defines operator == or operator != but does not override Object.GetHashCode()
/// </summary>
/// <returns></returns>
public override int GetHashCode()
{
return (columnProxy != null ? columnProxy.GetHashCode() : 0);
}

/// <summary>
/// Mitigate compiler warning CS0660:
/// 'Microsoft.Spark.CSharp.Sql.Column' defines operator == or operator != but does not override Object.Equals(object o)
/// </summary>
/// <returns></returns>
public override bool Equals(object obj)
{
if (ReferenceEquals(null, obj)) return false;
if (ReferenceEquals(this, obj)) return true;
return obj.GetType() == this.GetType() && Equals(this.columnProxy, ((Column)obj).columnProxy);
}

}

public class GroupedData
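For context on the `Equals`/`GetHashCode` additions above: C# emits CS0660/CS0661 whenever a type overloads `operator ==`/`operator !=` without overriding `Object.Equals`/`Object.GetHashCode`. A standalone sketch of the pattern follows; `Wrapper` is a made-up type, not from this repository.
```c#
using System;

public class Wrapper
{
    private readonly object proxy;

    public Wrapper(object proxy) { this.proxy = proxy; }

    // Declaring these operators without the Equals/GetHashCode overrides
    // below would produce warnings CS0660 and CS0661.
    public static bool operator ==(Wrapper a, Wrapper b)
    {
        if (ReferenceEquals(a, b)) return true;
        if ((object)a == null || (object)b == null) return false;
        return Equals(a.proxy, b.proxy);
    }

    public static bool operator !=(Wrapper a, Wrapper b)
    {
        return !(a == b);
    }

    public override bool Equals(object obj)
    {
        if (ReferenceEquals(null, obj)) return false;
        if (ReferenceEquals(this, obj)) return true;
        return obj.GetType() == GetType() && Equals(proxy, ((Wrapper)obj).proxy);
    }

    public override int GetHashCode()
    {
        return proxy != null ? proxy.GetHashCode() : 0;
    }
}
```
In `Column` itself the overloaded operators appear to exist to build comparison expressions for the DataFrame DSL (as in `reqDataFrame["C0"] == metricDataFrame["C3"]` in the README example), so the new overrides are there purely to satisfy the compiler.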
2 changes: 1 addition & 1 deletion scripts/sparkclr-submit.cmd
@@ -26,7 +26,7 @@ if "%SPARK_ASSEMBLY_JAR%"=="0" (
)

set SPARKCLR_JAR=spark-clr-1.4.1-SNAPSHOT.jar
set SPARKCLR_CLASSPATH=%SPARKCLR_HOME%\lib\%SPARKCLR_JAR%
set SPARKCLR_CLASSPATH=%SPARKCLR_HOME%\lib\%SPARKCLR_JAR%;%SPARKCSV_JARS%
set LAUNCH_CLASSPATH=%SPARK_ASSEMBLY_JAR%;%SPARKCLR_CLASSPATH%
set SPARKCLR_SUBMIT_CLASS=org.apache.spark.deploy.csharp.SparkCLRSubmit
set SPARK_SUBMIT_CLASS=org.apache.spark.deploy.SparkSubmit
