OpenCVのPhaseCorrelate関数を改造してPOCステレオをやってみる（その２）

(編集メモ）

高速化を検討します。

やり散らかし感のあるままのソースコードですが、興味がほかに移ってしまったので、あげておきます。FFTの順変換を先に全て計算してしまい大量のメモリに持たせています。

Core i7-4770Sでtsukubaの処理におよそ1.5～3.0秒。リアルタイムまであと40倍。。。

#include "opencv2/opencv.hpp"

#include <immintrin.h>//AVX: -mavx

#include <omp.h>

using namespace cv;

#if _DEBUG

#pragma comment(lib, "opencv_core248d.lib")

#pragma comment(lib, "opencv_highgui248d.lib")

#pragma comment(lib, "opencv_imgproc248d.lib")

#pragma comment(lib, "opencv_contrib248d.lib")

#else

#pragma comment(lib, "opencv_core248.lib")

#pragma comment(lib, "opencv_highgui248.lib")

#pragma comment(lib, "opencv_imgproc248.lib")

#pragma comment(lib, "opencv_contrib248.lib")

#endif

// インクルードパス、ライブラリパスをプロジェクトに登録してください。

void vec_mul(const size_t n, double *z, const double *x, const double *y);

static void mulSpectrums1D( Mat& srcA, Mat& srcB, Mat& dst, int flags, bool conjB );

static void magSpectrums1D( Mat &src, Mat &dst);

static void divSpectrums1D( Mat &srcA, Mat &srcB, Mat &dst, int flags, bool conjB);

static void fftShift1D( Mat &out);

void vec_mul(const size_t n, double *z, const double *x, const double *y){

static const size_t double_size = 4; //単精度は8つずつ計算

const size_t end = n / double_size;

//AVX 専用の型にデータをロードする

__m256d *vz = (__m256d *)z;

__m256d *vx = (__m256d *)x;

__m256d *vy = (__m256d *)y;

for(size_t i=0; i<end; ++i)

vz[i] = _mm256_mul_pd(vx[i], vy[i]); //AVX を用いる SIMD 演算

}

cv::Point2d phaseCorrelate1DWithoutDFT(vector<Mat> &fft1, vector<Mat> &fft2, int pu, int pv, int qu, int qv, int rows, int cols, int ww, int wh, int k=65535, int flag_pixel = false)

{

int M=wh;

int N=ww;

vector<Mat> C;

C.resize(M);

#pragma omp parallel for

for(int i=0; i<M; i++){

Mat P, Pm;

mulSpectrums1D(fft1[(pv+i)*cols+pu], fft2[(qv+i)*cols+qu], P, 0, true);

magSpectrums1D(P, Pm);

divSpectrums1D(P, Pm, C[i], 0, false); // FF* / |FF*| (phase correlation equation completed here...)

if(0<k&&k<C[i].cols){

Rect lpf_roi(k,0,C[i].cols-k,1);

Mat zero = C[i](lpf_roi);

zero = Mat::zeros(zero.size(), zero.type());

}

}

Mat Rall;

if(M<11){

#pragma omp parallel for

for(int i=0; i<M; i++){

Rall = Rall + C[i] / M;

}

}

else{

vector<double> hann;

hann.resize(M);

double sum_hann = 0;

for(int i=0; i<M; i++){

hann[i] = 0.5 - 0.5*cos(2.0 * CV_PI * static_cast<double>(i) / M); // ww-1

sum_hann += hann[i];

}

#pragma omp parallel for

for(int i=0; i<M; i++){

Rall = Rall + C[i] * hann[i] / sum_hann;

}

}

idft(Rall, Rall); // gives us the nice peak shift location...

fftShift1D(Rall); // shift the energy to the center of the frame.

// locate the highest peak

Point peakLoc;

minMaxLoc(Rall, NULL, NULL, NULL, &peakLoc);

Point2d t(0,0);

if(!flag_pixel&&(peakLoc.x>=1)&&(peakLoc.x<Rall.cols-1)){

t.x = (Rall.at<double>(0, peakLoc.x-1)-Rall.at<double>(0, peakLoc.x+1))/(2.0*Rall.at<double>(0,peakLoc.x-1)-4.0*Rall.at<double>(0,peakLoc.x)+2.0*Rall.at<double>(0,peakLoc.x+1));

}

// adjust shift relative to image center...

Point2d center( (double)ww/ 2.0, 0.0);

C.clear();

t.x = t.x + peakLoc.x;

return (center - t);

}

int main(int argc, char* argv[])

{

// Parameters

Mat tempI = imread("tsukuba_r.png", 0);

Mat tempJ = imread("tsukuba_l.png", 0);

if(tempI.empty()||tempJ.empty()) return -1;

double mult_scale = 2.0; // １階層ごとの画像サイズの縮尺

int thresh = tempI.cols/4; // 階層の深さ決定の目安

int ww = getOptimalDFTSize(32); // 窓の幅半値幅が有効幅

int wh = 17; // 窓の高さ

// Decision of the Image Hierarchical Level

int lmax=0;

for(;;){

int value = (int)(pow(mult_scale,-(double)(lmax++))*tempI.cols);

if(thresh>value) break;

}

vector<int> width, height;

width.resize(lmax);

height.resize(lmax);

for(int i=0; i<lmax; i++){

width[i] = (int)(pow(mult_scale,-(i))*tempI.cols);

height[i] = tempI.rows;

}

Mat hann = Mat(1,ww,CV_64F);

for(int i=0; i<ww; i++){

hann.at<double>(i) = 0.5 - 0.5*cos(2.0 * CV_PI * static_cast<double>(i) / (ww)); // ww-1

}

Mat disparity = Mat::zeros( Size(tempI.cols, tempI.rows), CV_64F);

vector<vector<Mat> > dfti(lmax, vector<Mat>(tempI.cols*(tempI.rows+ww+1)));

vector<vector<Mat> > dftj(lmax, vector<Mat>(tempJ.cols*(tempJ.rows+ww+1)));

// Step1: Image Pyramid

vector<Mat> I, J, LI, LJ;

LI.resize(lmax);

LJ.resize(lmax);

I.resize(lmax);

J.resize(lmax);

#pragma omp parallel for

for(int l=0; l<lmax; l++){

resize(tempI, I[l], Size(width[l], height[l]), 0, 0, 1);

resize(tempJ, J[l], Size(width[l], height[l]), 0, 0, 1);

copyMakeBorder(I[l], LI[l], wh/2, wh/2+1, ww/2, ww/2, BORDER_CONSTANT, Scalar::all(0));

copyMakeBorder(J[l], LJ[l], wh/2, wh/2+1, ww/2, ww/2, BORDER_CONSTANT, Scalar::all(0));

}

for(int l=lmax-1; l>=0; l--){

for(int v= 0; v<(int)LI[l].rows; v++){

#pragma omp parallel for

for(int u=0; u<(int)I[l].cols; u++){

Rect roi(u,v,ww,1);

Mat Iroi = LI[l](roi);

Mat I64f, J64f;

Iroi.convertTo(I64f, CV_64F);

multiply(hann, I64f, I64f);

dft(I64f, dfti[l][v*I[l].cols+u], DFT_REAL_OUTPUT);

Mat Jroi = LJ[l](roi);

Jroi.convertTo(J64f, CV_64F);

multiply(hann, J64f, J64f);

dft(J64f, dftj[l][v*J[l].cols+u], DFT_REAL_OUTPUT);

}

}

}

for(int v= 0; v<(int)I[0].rows; v++){

#pragma omp parallel for

for(int u=0; u<(int)I[0].cols; u++){

int error = 0;

vector<Point2d> P, Q, Qd;

P.resize(lmax+1);

Q.resize(lmax+1);

Qd.resize(lmax+1);

// Step2:

P[lmax] = Point2d(pow(mult_scale,-lmax)*u,v);

Q[lmax] = Point2d(pow(mult_scale,-lmax)*u,v);

for(int l=lmax-1; l>=0; l--){

// Step3:

P[l] = Point2d(pow(mult_scale,-l)*u,v);

Qd[l] = Point2d(mult_scale*Q[l+1].x,Q[l+1].y);

// Step4:

if( (Qd[l].x<0)||(LJ[l].cols<Qd[l].x+ww)){

Q[l] = P[l];

continue;

}

Point2d shift = phaseCorrelate1DWithoutDFT(dfti[l], dftj[l], P[l].x, P[l].y, Qd[l].x, Qd[l].y, I[l].rows, I[l].cols, ww, wh, ww/2, true);

if( P[l].x < (shift.x+Qd[l].x)){

Q[l] = Qd[l]+Point2d(shift.x, 0);

}

else {

Q[l] = Qd[l];

}

} // Step5

{ // Step6

Rect i_roi( (int)P[0].x,(int)P[0].y,ww,wh);

if( (Q[0].x<0)||(LJ[0].cols<Q[0].x+ww)){

error = 1;

goto end;

}

Rect j_roi = Rect( (int)Q[0].x,(int)Q[0].y,ww,wh);

Point2d shift = phaseCorrelate1DWithoutDFT(dfti[0], dftj[0], P[0].x, P[0].y, Q[0].x, Q[0].y, I[0].rows, I[0].cols, ww, wh, ww/2, false);

if( P[0].x < (shift.x+Q[0].x)){

Q[0] = Q[0]+Point2d(shift.x, 0);

}

#if _DEBUG

Mat matchedImg = Mat::zeros(Size(I[0].cols*2,0), CV_8UC1);

vconcat(I[0],J[0],matchedImg);

cvtColor( matchedImg, matchedImg, COLOR_GRAY2RGB);

line(matchedImg,Point(P[0].x,P[0].y),Point(Q[0].x,matchedImg.rows/2+Q[0].y),Scalar(0,0,255));

imshow("matched", matchedImg);

waitKey(1);

#endif

}

end:

double d = Q[0].x-P[0].x;

double view_band = 1.0; // エラー閾値、表示調整用

if(view_band*ww<d||error) d=0;

disparity.at<double>(v, u) = d;

}

}

// http://stackoverflow.com/questions/13840013/opencv-how-to-visualize-a-depth-image

double min, max;

cv::minMaxIdx(disparity, &min, &max);

Mat adjMap;

double scale = 255/(max-min);

disparity.convertTo(adjMap, CV_8UC1, scale, -min*scale);

cv::Mat falseColorMap;

applyColorMap(adjMap, falseColorMap, cv::COLORMAP_JET);

imshow("disparity", falseColorMap);

imwrite("disparity.bmp", falseColorMap);

waitKey();

return 0;

}

static void mulSpectrums1D( Mat& srcA, Mat& srcB,

Mat& dst, int flags, bool conjB )

{

int depth = srcA.depth(), cn = srcA.channels(), type = srcA.type();

int rows = srcA.rows, cols = srcA.cols;

int j, k;

CV_Assert( type == srcB.type() && srcA.size() == srcB.size() );

CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );

dst.create( srcA.rows, srcA.cols, type );

bool is_1d = (flags & DFT_ROWS) || (rows == 1 || (cols == 1 &&

srcA.isContinuous() && srcB.isContinuous() && dst.isContinuous()));

if( is_1d && !(flags & DFT_ROWS) )

cols = cols + rows - 1, rows = 1;

int ncols = cols*cn;

int j0 = cn == 1;

int j1 = ncols - (cols % 2 == 0 && cn == 1);

const double* dataA = (const double*)srcA.data;

const double* dataB = (const double*)srcB.data;

double* dataC = (double*)dst.data;

size_t stepA = srcA.step/sizeof(dataA[0]);

size_t stepB = srcB.step/sizeof(dataB[0]);

size_t stepC = dst.step/sizeof(dataC[0]);

dataC[0] = dataA[0]*dataB[0];

if( cols % 2 == 0 )

dataC[j1] = dataA[j1]*dataB[j1];

if( !conjB )

for( j = j0; j < j1; j += 2 )

{

double re = dataA[j]*dataB[j] - dataA[j+1]*dataB[j+1];

double im = dataA[j+1]*dataB[j] + dataA[j]*dataB[j+1];

dataC[j] = re; dataC[j+1] = im;

}

else

for( j = j0; j < j1; j += 2 )

{

double re = dataA[j]*dataB[j] + dataA[j+1]*dataB[j+1];

double im = dataA[j+1]*dataB[j] - dataA[j]*dataB[j+1];

dataC[j] = re; dataC[j+1] = im;

}

}

static void magSpectrums1D( Mat &src, Mat &dst)

{

int depth = src.depth(), cn = src.channels(), type = src.type();

int rows = src.rows, cols = src.cols;

int j, k;

CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );

dst.create( src.rows, src.cols, CV_64FC1 );

dst.setTo(0);//Mat elements are not equal to zero by default!

bool is_1d = (rows == 1 || (cols == 1 && src.isContinuous() && dst.isContinuous()));

if( is_1d )

cols = cols + rows - 1, rows = 1;

int ncols = cols*cn;

int j0 = cn == 1;

int j1 = ncols - (cols % 2 == 0 && cn == 1);

const double* dataSrc = (const double*)src.data;

double* dataDst = (double*)dst.data;

size_t stepSrc = src.step/sizeof(dataSrc[0]);

size_t stepDst = dst.step/sizeof(dataDst[0]);

dataDst[0] = dataSrc[0]*dataSrc[0];

if( cols % 2 == 0 )

dataDst[j1] = dataSrc[j1]*dataSrc[j1];

for( j = j0; j < j1; j += 2 )

{

dataDst[j] = std::sqrt(dataSrc[j]*dataSrc[j] + dataSrc[j+1]*dataSrc[j+1]);

}

}

static void divSpectrums1D( Mat &srcA, Mat &srcB, Mat &dst, int flags, bool conjB)

{

int depth = srcA.depth(), cn = srcA.channels(), type = srcA.type();

int rows = srcA.rows, cols = srcA.cols;

int j, k;

CV_Assert( type == srcB.type() && srcA.size() == srcB.size() );

CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );

dst.create( srcA.rows, srcA.cols, type );

bool is_1d = (flags & DFT_ROWS) || (rows == 1 || (cols == 1 &&

srcA.isContinuous() && srcB.isContinuous() && dst.isContinuous()));

if( is_1d && !(flags & DFT_ROWS) )

cols = cols + rows - 1, rows = 1;

int ncols = cols*cn;

int j0 = cn == 1;

int j1 = ncols - (cols % 2 == 0 && cn == 1);

const double* dataA = (const double*)srcA.data;

const double* dataB = (const double*)srcB.data;

double* dataC = (double*)dst.data;

double eps = DBL_EPSILON; // prevent div0 problems

size_t stepA = srcA.step/sizeof(dataA[0]);

size_t stepB = srcB.step/sizeof(dataB[0]);

size_t stepC = dst.step/sizeof(dataC[0]);

if( is_1d && cn == 1 )

{

dataC[0] = dataA[0] / (dataB[0] + eps);

if( cols % 2 == 0 )

dataC[j1] = dataA[j1] / (dataB[j1] + eps);

}

if( !conjB )

for( j = j0; j < j1; j += 2 )

{

double denom = dataB[j]*dataB[j] + dataB[j+1]*dataB[j+1] + eps;

double re = dataA[j]*dataB[j] + dataA[j+1]*dataB[j+1];

double im = dataA[j+1]*dataB[j] - dataA[j]*dataB[j+1];

dataC[j] = re / denom;

dataC[j+1] = im / denom;

}

else

for( j = j0; j < j1; j += 2 )

{

double denom = dataB[j]*dataB[j] + dataB[j+1]*dataB[j+1] + eps;

double re = dataA[j]*dataB[j] - dataA[j+1]*dataB[j+1];

double im = dataA[j+1]*dataB[j] + dataA[j]*dataB[j+1];

dataC[j] = re / denom;

dataC[j+1] = im / denom;

}

}

static void fftShift1D(Mat &out)

{

if(out.rows == 1 && out.cols == 1)

{

// trivially shifted.

return;

}

std::vector<Mat> planes;

split(out, planes);

int xMid = out.cols >> 1;

int yMid = out.rows >> 1;

bool is_1d = xMid == 0 || yMid == 0;

xMid = xMid + yMid;

for(size_t i = 0; i < planes.size(); i++)

{

Mat tmp;

Mat half0(planes[i], Rect(0, 0, xMid, 1));

Mat half1(planes[i], Rect(xMid, 0, xMid, 1));

half0.copyTo(tmp);

half1.copyTo(half0);

tmp.copyTo(half1);

}

merge(planes, out);

}

・OpenCVのDFTよりも大浦さんFFTの方が2倍近く早い

・順FFTは画像全画素に対して計算してしまう

・SIMD

大浦さんのFFT(rdft)の変換結果

2492.17 26.841 -1320.47 -87.0134 -7.17704 -60.3765 51.3907 -3.14701 31.0883 -101.097 54.217 111.956 -31.8355 50.0337 -39.1108 -24.2204 -6.02643 -8.15339 -1.7731

8 -34.7583 33.0092 1.28083 19.6931 22.3442 -16.5685 18.5711 -22.3404 8.28457 -13.9967 -24.1731 10.3953 -20.5076

r[0]r[N]r[1]i[1]・・・

OpenCVのDFTの変換結果

2492.17 -1320.47 87.0134 -7.17704 60.3765 51.3907 3.14701 31.0883 101.097 54.217 -111.956 -31.8355 -50.0337 -39.1108 24.2204 -6.02643 8.15339 -1.77318 34.7583 3

3.0092 -1.28083 19.6931 -22.3442 -16.5685 -18.5711 -22.3404 -8.28457 -13.9967 24.1731 10.3953 20.5076 26.841

r[0]r[1]i[1]・・・r[N]

r[1]の配置と、大浦さんFFTは虚数部の符号が共役になっていることが違う。

処理速度メモ

630.583:1118.1=大浦さんFFTの計算時間:OpenCVDFT計算時間

ハン窓適用しない場合　425.723

ハン窓適用AVX2使用　503.28

無理やりfloat 408

2次元ならFastTemplateMatchingとほぼ同じで、OpenCVのcvMatchTemplateに実装されている。うーん、やっぱり素直にテンプレートマッチングでいいか。。。

BM 6.012294ms

f:id:cvl-robot:20140213193850j:plain