Correlation in Rcpp

To install Systematic Investor Toolbox (SIT) please visit About page.

There are many great examples of using Rcpp at Rcpp Gallery. Below, I will use RcppParallel to speed computations of correlation matrix.

#*****************************************************************
# Load historical end of day data
#*****************************************************************
library(SIT)
load.packages('quantmod')

filename = 'big.test.Rdata'

if(!file.exists(filename)) {
  tickers = nasdaq.100.components()
  tickers = sp500.components()$tickers

  data = env()
  getSymbols.extra(tickers, src = 'yahoo', from = '1970-01-01', env = data, set.symbolnames = T, auto.assign = T)
  for(i in data$symbolnames) data[[i]] = adjustOHLC(data[[i]], use.Adjusted=T)
  #print(bt.start.dates(data))

  bt.prep(data, align='keep.all', dates='2000::', fill.gaps=T)

  # remove ones with little history
  bt.prep.remove.symbols.min.history(data, 3*252)

  # show the ones removed
  print(setdiff(tickers,names(data$prices)))

  prices = data$prices
  save(prices, file=filename)
}

load(file=filename)

#*****************************************************************
# Setup
#*****************************************************************
ret = prices / mlag(prices) - 1
  ret = last(ret,252)
  ret = coredata(na.omit(ret))

#*****************************************************************
# Compute Correlation
#*****************************************************************
rtest = function(ret) {
	temp = cor(ret)
	temp[!lower.tri(temp)] = 0
	temp
}

r.cor = rtest(ret)

First let’s implement correlation using Rcpp:

#include <Rcpp.h>
using namespace Rcpp;
using namespace std;

// [[Rcpp::plugins(cpp11)]]

struct asset_info {
	double sum, sum2, stdev;
};

//[correlation matrix](http://en.wikipedia.org/wiki/Correlation_and_dependence).
// n,sX,sY,sXY,sX2,sY2
// cor = ( n * sXY - sX * sY ) / ( sqrt(n * sX2 - sX^2) * sqrt(n * sY2 - sY^2) )
inline asset_info compute_asset_info(const NumericMatrix& mat, 
	const int icol, const int rstart, const int rend) {
	double sum, sum2;
    sum = sum2 = 0;

	for (int r = rstart; r < rend; r++) {
		double d = mat(r, icol);
		sum += d;
		sum2 += pow(d,2);
	}

	asset_info res;
		res.sum = sum;
		res.sum2 = sum2;
		res.stdev = sqrt((rend-rstart) * sum2 - pow(sum, 2));
	return res;
}

inline NumericMatrix c_cor_helper(const NumericMatrix& mat, const int rstart, const int rend) {
	int nc = mat.ncol();
	int nperiod = rend - rstart;
	NumericMatrix rmat(nc, nc);

	vector<asset_info> info(nc);
	for (int c = 0; c < nc; c++)
		info[c] = compute_asset_info(mat, c, rstart, rend);

	for (int c1 = 0; c1 < nc; c1++) {
		for (int c2 = 0; c2 < c1; c2++) {
			double sXY = 0;

			for (int r = rstart; r < rend; r++)
				sXY += mat(r, c1) * mat(r, c2);

			rmat(c1, c2) = (nperiod * sXY - info[c1].sum * info[c2].sum) / (info[c1].stdev * info[c2].stdev);
		}
	}

	return rmat;
}

// [[Rcpp::export]]
NumericMatrix c_cor(NumericMatrix mat) {
	return c_cor_helper(mat, 0, mat.nrow());
}

Please save above code in the correlation.cpp file or download correlation.cpp.

Next let’s make sure it produces same results.

#*****************************************************************
# Rcpp Correlation
#*****************************************************************
# make sure to install [Rtools on windows](http://cran.r-project.org/bin/windows/Rtools/)
load.packages('Rcpp')

# load Rcpp functions
sourceCpp('correlation.cpp')

r.cor = rtest(ret)
c.cor = c_cor(ret)
print(test.equality(r.cor, c.cor))
item1 item2 equal
r.cor c.cor TRUE

The RcppParallel is an easy way to speed up above computations. For more details about RcppParallel I recommend following resources:

Next let’s implement correlation using RcppParallel:

/*------ Parallel version ------*/

// [[Rcpp::depends(RcppParallel)]]
#include <RcppParallel.h>
using namespace RcppParallel;

// pre-compute sum and stdev
struct cor_p1 : public Worker {
	const RMatrix<double> mat;
	const int rstart, rend, nperiod;

	RVector<double> rsum, rstdev;

	cor_p1(const NumericMatrix& mat, const int rstart, const int rend,
		NumericVector rsum, NumericVector rstdev)
		: mat(mat), rstart(rstart), rend(rend), nperiod(rend - rstart), rsum(rsum), rstdev(rstdev) { }

	void operator()(size_t begin, size_t end) {
		for (size_t c = begin; c < end; c++) {
			double sum, sum2;
			sum = sum2 = 0;

			for (int r = rstart; r < rend; r++) {
				double d = mat(r,c);
				sum += d;
				sum2 += pow(d,2);
			}

			rsum[c] = sum;
			rstdev[c] = sqrt(nperiod * sum2 - pow(sum,2));
		}
	}
};
// compute correlation
struct cor_p2 : public Worker {
	const RMatrix<double> mat;
	const int rstart, rend, nperiod;
	const RVector<double> sum, stdev;
    
	RMatrix<double> rmat;

	cor_p2(const NumericMatrix& mat, const int rstart, const int rend,
		const NumericVector& sum, const NumericVector& stdev, 
		NumericMatrix rmat)
		: mat(mat), rstart(rstart), rend(rend), nperiod(rend - rstart), sum(sum), stdev(stdev), rmat(rmat) {}

	void operator()(size_t begin, size_t end) {
		for (size_t c1 = begin; c1 < end; c1++) {
			for (size_t c2 = 0; c2 < c1; c2++) {
				double sXY = 0;
				for (int r = rstart; r < rend; r++)
					sXY += mat(r,c1) * mat(r,c2);

				rmat(c1,c2) = (nperiod * sXY - sum[c1] * sum[c2]) / (stdev[c1] * stdev[c2]);         
			}
		}
	}
};

inline NumericMatrix cp_cor_helper(const NumericMatrix& mat, const int rstart, const int rend) {
	int nc = mat.ncol();
	NumericVector rsum(nc), rstdev(nc);

	cor_p1 p1(mat, rstart, rend, rsum, rstdev);
	parallelFor(0, nc, p1);

	NumericMatrix rmat(nc, nc);

	cor_p2 p2(mat, rstart, rend, rsum, rstdev, rmat);
	parallelFor(0, nc, p2);

	return rmat;
}

// [[Rcpp::export]]
NumericMatrix cp_cor(NumericMatrix mat) {
	return cp_cor_helper(mat, 0, mat.nrow());
}

Please save above code in the correlation.cpp file or download correlation.cpp.

Next let’s make sure it produces same results and compare the run times.

#*****************************************************************
# RcppParallel Correlation
#*****************************************************************
# make sure to install [Rtools on windows](http://cran.r-project.org/bin/windows/Rtools/)
load.packages('Rcpp')
load.packages('RcppParallel')
# [RcppParallel help site](http://rcppcore.github.io/RcppParallel/)
# [RcppParallel source code](https://github.com/RcppCore/RcppParallel)
# [RcppParallel introduction](http://dirk.eddelbuettel.com/blog/2014/07/16/#introducing_rcppparallel)
# [RcppParallel gallery](http://gallery.rcpp.org/articles/parallel-distance-matrix/)
# defaultNumThreads()

# load Rcpp functions
sourceCpp('correlation.cpp')

r.cor = rtest(ret)
c.cor = c_cor(ret)
cp.cor = cp_cor(ret)
print(test.equality(r.cor, c.cor, cp.cor, type='all'))
item1 item2 equal
r.cor c.cor TRUE
r.cor cp.cor TRUE
c.cor cp.cor TRUE
# compare performance
library(rbenchmark)
res <- benchmark(cor(ret),
                 c_cor(ret),
                 cp_cor(ret),
                 replications = 20,
                 order="relative")
print(res[,1:4])
rownames(x) test replications elapsed relative
3 cp_cor(ret) 20 0.14 1.000
2 c_cor(ret) 20 0.50 3.571
1 cor(ret) 20 0.52 3.714

The RcppParallel version is about 3.5 times faster.

(this report was produced on: 2015-04-11)