diff --git a/FDTD/engine_sse.cpp b/FDTD/engine_sse.cpp
new file mode 100644
index 0000000..e434b68
--- /dev/null
+++ b/FDTD/engine_sse.cpp
@@ -0,0 +1,145 @@
+/*
+* Copyright (C) 2010 Thorsten Liebig (Thorsten.Liebig@gmx.de)
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program. If not, see .
+*/
+
+#include "engine_sse.h"
+#include "tools/array_ops.h"
+
+//! \brief construct an Engine_sse instance
+//! it's the responsibility of the caller to free the returned pointer
+Engine_sse* Engine_sse::New(const Operator* op)
+{
+ Engine_sse* e = new Engine_sse(op);
+ e->Init();
+ return e;
+}
+
+Engine_sse::Engine_sse(const Operator* op) : Engine(op)
+{
+ Op = op;
+ for (int n=0;n<3;++n)
+ {
+ numLines[n] = Op->GetNumberOfLines(n);
+ }
+}
+
+Engine_sse::~Engine_sse()
+{
+ this->Reset();
+}
+
+void Engine_sse::Init()
+{
+ numTS = 0;
+ volt_ = Create_N_3DArray_v4sf(numLines);
+ curr = Create_N_3DArray(numLines);
+}
+
+void Engine_sse::Reset()
+{
+ Delete_N_3DArray_v4sf(volt_,numLines);
+ volt=NULL;
+ Delete_N_3DArray(curr,numLines);
+ curr=NULL;
+}
+
+void Engine_sse::UpdateVoltages()
+{
+ unsigned int pos[4];
+ bool shift[3];
+
+ //voltage updates
+ for (pos[0]=0;pos[0]vv_[0][pos[0]][pos[1]][pos[2]].v;
+ volt_[1][pos[0]][pos[1]][pos[2]].v *= Op->vv_[1][pos[0]][pos[1]][pos[2]].v;
+ volt_[2][pos[0]][pos[1]][pos[2]].v *= Op->vv_[2][pos[0]][pos[1]][pos[2]].v;
+
+
+ for (pos[3]=0;pos[3]<4;++pos[3]) {
+ shift[2]=pos[2]+pos[3];
+ volt_[0][pos[0]][pos[1]][pos[2]].f[pos[3]] += Op->vi_[0][pos[0]][pos[1]][pos[2]].f[pos[3]] * ( curr[2][pos[0]][pos[1]][pos[2]] - curr[2][pos[0]][pos[1]-shift[1]][pos[2]] - curr[1][pos[0]][pos[1]][pos[2]] + curr[1][pos[0]][pos[1]][pos[2]-shift[2]]);
+ volt_[1][pos[0]][pos[1]][pos[2]].f[pos[3]] += Op->vi_[1][pos[0]][pos[1]][pos[2]].f[pos[3]] * ( curr[0][pos[0]][pos[1]][pos[2]] - curr[0][pos[0]][pos[1]][pos[2]-shift[2]] - curr[2][pos[0]][pos[1]][pos[2]] + curr[2][pos[0]-shift[0]][pos[1]][pos[2]]);
+ volt_[2][pos[0]][pos[1]][pos[2]].f[pos[3]] += Op->vi_[2][pos[0]][pos[1]][pos[2]].f[pos[3]] * ( curr[1][pos[0]][pos[1]][pos[2]] - curr[1][pos[0]-shift[0]][pos[1]][pos[2]] - curr[0][pos[0]][pos[1]][pos[2]] + curr[0][pos[0]][pos[1]-shift[1]][pos[2]]);
+ }
+ }
+ }
+ }
+}
+
+void Engine_sse::ApplyVoltageExcite()
+{
+ int exc_pos;
+ //soft voltage excitation here (E-field excite)
+ for (unsigned int n=0;nE_Exc_Count;++n)
+ {
+ exc_pos = (int)numTS - (int)Op->E_Exc_delay[n];
+ exc_pos *= (exc_pos>0 && exc_pos<=(int)Op->ExciteLength);
+// if (n==0) cerr << numTS << " => " << Op->ExciteSignal[exc_pos] << endl;
+ volt[Op->E_Exc_dir[n]][Op->E_Exc_index[0][n]][Op->E_Exc_index[1][n]][Op->E_Exc_index[2][n]] += Op->E_Exc_amp[n]*Op->ExciteSignal[exc_pos];
+ }
+}
+
+void Engine_sse::UpdateCurrents()
+{
+ unsigned int pos[3];
+ for (pos[0]=0;pos[0]ii[0][pos[0]][pos[1]][pos[2]];
+ curr[0][pos[0]][pos[1]][pos[2]] += Op->iv[0][pos[0]][pos[1]][pos[2]] * ( volt[2][pos[0]][pos[1]][pos[2]] - volt[2][pos[0]][pos[1]+1][pos[2]] - volt[1][pos[0]][pos[1]][pos[2]] + volt[1][pos[0]][pos[1]][pos[2]+1]);
+
+ //for y
+ curr[1][pos[0]][pos[1]][pos[2]] *= Op->ii[1][pos[0]][pos[1]][pos[2]];
+ curr[1][pos[0]][pos[1]][pos[2]] += Op->iv[1][pos[0]][pos[1]][pos[2]] * ( volt[0][pos[0]][pos[1]][pos[2]] - volt[0][pos[0]][pos[1]][pos[2]+1] - volt[2][pos[0]][pos[1]][pos[2]] + volt[2][pos[0]+1][pos[1]][pos[2]]);
+
+ //for z
+ curr[2][pos[0]][pos[1]][pos[2]] *= Op->ii[2][pos[0]][pos[1]][pos[2]];
+ curr[2][pos[0]][pos[1]][pos[2]] += Op->iv[2][pos[0]][pos[1]][pos[2]] * ( volt[1][pos[0]][pos[1]][pos[2]] - volt[1][pos[0]+1][pos[1]][pos[2]] - volt[0][pos[0]][pos[1]][pos[2]] + volt[0][pos[0]][pos[1]+1][pos[2]]);
+ }
+ }
+ }
+}
+
+void Engine_sse::ApplyCurrentExcite()
+{
+}
+
+bool Engine_sse::IterateTS(unsigned int iterTS)
+{
+ for (unsigned int iter=0;iter.
+*/
+
+#ifndef ENGINE_SSE_H
+#define ENGINE_SSE_H
+
+#include "operator.h"
+#include "engine.h"
+
+class Engine_sse : public Engine
+{
+public:
+ static Engine_sse* New(const Operator* op);
+ virtual ~Engine_sse();
+
+ virtual void Init();
+ virtual void Reset();
+
+ //!Iterate a number of timesteps
+ virtual bool IterateTS(unsigned int iterTS);
+
+ virtual unsigned int GetNumberOfTimesteps() {return numTS;};
+
+// virtual f4vector**** GetVoltages() {return volt;};
+ virtual FDTD_FLOAT**** GetCurrents() {return curr;};
+
+protected:
+ Engine_sse(const Operator* op);
+ const Operator* Op;
+
+ virtual void UpdateVoltages();
+ virtual void ApplyVoltageExcite();
+ virtual void UpdateCurrents();
+ virtual void ApplyCurrentExcite();
+
+ unsigned int numLines[3];
+
+ f4vector**** volt_;
+ FDTD_FLOAT**** curr;
+ unsigned int numTS;
+};
+
+#endif // ENGINE_SSE_H
diff --git a/FDTD/operator.cpp b/FDTD/operator.cpp
index 8314cb6..429e863 100644
--- a/FDTD/operator.cpp
+++ b/FDTD/operator.cpp
@@ -47,6 +47,8 @@ void Operator::Init()
E_Exc_dir=NULL;
vv=NULL;
vi=NULL;
+ vv_=NULL;
+ vi_=NULL;
iv=NULL;
ii=NULL;
for (int n=0;n<3;++n)
@@ -75,6 +77,8 @@ void Operator::Reset()
delete[] E_Exc_amp;
Delete_N_3DArray(vv,numLines);
Delete_N_3DArray(vi,numLines);
+ Delete_N_3DArray_v4sf(vv_,numLines);
+ Delete_N_3DArray_v4sf(vi_,numLines);
Delete_N_3DArray(iv,numLines);
Delete_N_3DArray(ii,numLines);
for (int n=0;n<3;++n)
@@ -502,10 +506,14 @@ void Operator::InitOperator()
{
Delete_N_3DArray(vv,numLines);
Delete_N_3DArray(vi,numLines);
+ Delete_N_3DArray_v4sf(vv_,numLines);
+ Delete_N_3DArray_v4sf(vi_,numLines);
Delete_N_3DArray(iv,numLines);
Delete_N_3DArray(ii,numLines);
vv = Create_N_3DArray(numLines);
vi = Create_N_3DArray(numLines);
+ vv_ = Create_N_3DArray_v4sf(numLines);
+ vi_ = Create_N_3DArray_v4sf(numLines);
iv = Create_N_3DArray(numLines);
ii = Create_N_3DArray(numLines);
}
@@ -516,6 +524,9 @@ inline void Operator::Calc_ECOperatorPos(int n, unsigned int* pos)
vv[n][pos[0]][pos[1]][pos[2]] = (1-dT*EC_G[n][i]/2/EC_C[n][i])/(1+dT*EC_G[n][i]/2/EC_C[n][i]);
vi[n][pos[0]][pos[1]][pos[2]] = (dT/EC_C[n][i])/(1+dT*EC_G[n][i]/2/EC_C[n][i]);
+ vv_[n][pos[0]][pos[1]][pos[2]/4].f[pos[2]%4] = vv[n][pos[0]][pos[1]][pos[2]];
+ vi_[n][pos[0]][pos[1]][pos[2]/4].f[pos[2]%4] = vi[n][pos[0]][pos[1]][pos[2]];
+
ii[n][pos[0]][pos[1]][pos[2]] = (1-dT*EC_R[n][i]/2/EC_L[n][i])/(1+dT*EC_R[n][i]/2/EC_L[n][i]);
iv[n][pos[0]][pos[1]][pos[2]] = (dT/EC_L[n][i])/(1+dT*EC_R[n][i]/2/EC_L[n][i]);
}
diff --git a/FDTD/operator.h b/FDTD/operator.h
index ee1a9b4..899b91f 100644
--- a/FDTD/operator.h
+++ b/FDTD/operator.h
@@ -24,6 +24,21 @@
#define FDTD_FLOAT float
+
+#if __SIZEOF_FLOAT__ != 4
+ #error wrong size of float
+#endif
+
+typedef float v4sf __attribute__ ((vector_size (16))); // vector of four single floats
+
+union f4vector
+{
+ v4sf v;
+ float f[4];
+};
+
+
+
//! Abstract base-class for the FDTD-operator
class Operator
{
@@ -130,6 +145,8 @@ public:
FDTD_FLOAT**** vi; //calc new voltage from old current
FDTD_FLOAT**** ii; //calc new current from old current
FDTD_FLOAT**** iv; //calc new current from old voltage
+ f4vector**** vv_; //calc new voltage from old voltage
+ f4vector**** vi_; //calc new voltage from old current
//Excitation time-signal
unsigned int ExciteLength;
diff --git a/openEMS.pro b/openEMS.pro
index 6ab1fba..8a3e634 100644
--- a/openEMS.pro
+++ b/openEMS.pro
@@ -38,7 +38,8 @@ SOURCES += main.cpp \
openems.cpp \
FDTD/engine_multithread.cpp \
FDTD/operator_cylinder.cpp \
- FDTD/engine_cylinder.cpp
+ FDTD/engine_cylinder.cpp \
+ FDTD/engine_sse.cpp
HEADERS += tools/ErrorMsg.h \
tools/AdrOp.h \
tools/constants.h \
@@ -54,10 +55,11 @@ HEADERS += tools/ErrorMsg.h \
openems.h \
FDTD/engine_multithread.h \
FDTD/operator_cylinder.h \
- FDTD/engine_cylinder.h
-QMAKE_CXXFLAGS_RELEASE = -O2 \
+ FDTD/engine_cylinder.h \
+ FDTD/engine_sse.h
+QMAKE_CXXFLAGS_RELEASE = -O3 \
-g \
- -march=native
+ -march=native
QMAKE_CXXFLAGS_DEBUG = -O0 \
-g \
-march=native
diff --git a/openems.cpp b/openems.cpp
index a918e6d..ff9f3d9 100644
--- a/openems.cpp
+++ b/openems.cpp
@@ -21,6 +21,7 @@
#include "FDTD/engine.h"
#include "FDTD/engine_cylinder.h"
#include "FDTD/engine_multithread.h"
+#include "FDTD/engine_sse.h"
#include "FDTD/processvoltage.h"
#include "FDTD/processcurrent.h"
#include "FDTD/processfields_td.h"
@@ -112,6 +113,12 @@ bool openEMS::parseCommandLineArgument( const char *argv )
cout << "openEMS - fixed number of threads: " << m_engine_numThreads << endl;
return true;
}
+ else if (strcmp(argv,"--engine=sse")==0)
+ {
+ cout << "openEMS - enabled sse engine" << endl;
+ m_engine = EngineType_SSE;
+ return true;
+ }
return false;
}
diff --git a/openems.h b/openems.h
index 475a5cd..2ccd7f3 100644
--- a/openems.h
+++ b/openems.h
@@ -61,7 +61,7 @@ protected:
Engine* FDTD_Eng;
ProcessingArray* PA;
- enum EngineType {EngineType_Standard,EngineType_Multithreaded};
+ enum EngineType {EngineType_Standard,EngineType_Multithreaded,EngineType_SSE};
EngineType m_engine;
unsigned int m_engine_numThreads;
};
diff --git a/tools/array_ops.cpp b/tools/array_ops.cpp
index 0393466..a5e1860 100644
--- a/tools/array_ops.cpp
+++ b/tools/array_ops.cpp
@@ -91,3 +91,70 @@ void Dump_N_3DArray2File(ostream &file, FDTD_FLOAT**** array, unsigned int* numL
}
}
}
+
+
+
+
+void Delete3DArray_v4sf(f4vector*** array, const unsigned int* numLines)
+{
+ if (array==NULL) return;
+ unsigned int pos[3];
+ for (pos[0]=0;pos[0]