...
 
Commits (227)
......@@ -4,6 +4,7 @@ variables:
GIT_SUBMODULE_STRATEGY: recursive
HYPERTHREADING: "false"
OMP_NUM_THREADS: 1
DUNECI_CXXFLAGS: "-march=native"
before_script:
- echo "Running make concurrently with ${DUNECI_PARALLEL} cores, where possibly"
......
......@@ -12,7 +12,7 @@
url = https://github.com/inducer/cgen.git
[submodule "dune/codegen/vectorclass"]
path = dune/codegen/vectorclass
url = https://github.com/darealshinji/vectorclass.git
url = https://github.com/vectorclass/version2.git
[submodule "python/pytools"]
path = python/pytools
url = https://github.com/inducer/pytools.git
cmake_minimum_required(VERSION 2.8.12)
cmake_minimum_required(VERSION 3.1.3)
project(dune-codegen CXX)
if(NOT (dune-common_DIR OR dune-common_ROOT OR
......@@ -21,6 +21,8 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDUNE_ISTL_SUPPORT_OLD_CATEGORY_INTERFA
# start a dune project with information from dune.module
dune_project()
dune_python_force_version(3)
dune_add_library(dunecodegen dune/codegen/common/tsc.cc)
dune_target_enable_all_packages(dunecodegen)
......@@ -29,6 +31,7 @@ dune_register_package_flags(LIBRARIES dunecodegen)
dune_enable_all_packages()
add_subdirectory(doc)
add_subdirectory(dune/codegen)
add_subdirectory(cmake/modules)
......
......@@ -105,6 +105,25 @@ ctest
Note that this takes quite a while.
## Building and Running dune-codegen in an offline environment
dune-codegen relies on installing Python packages into self-contained environments
during its configuration and build process. In order to do this in an offline
environment, we recommend using the tool `devpi`. One of its use cases is to provide
a local mirror for the Python package index. A quickstart tutorial for this use case
is available [5]. It boils down to the following:
* Installing the `devpi-server` package through your favorite method
* Setting up a local server with `devpi-server --init`
* Making sure it is running in the background (explicitly with `devpi-server --start/stop` or by configuring a systemd service.
* Have the environment variable `PIP_INDEX_URL` to its index, e.g. by adding this line to your `~/.bashrc` (where `http://localhost:3141` might differ depending on your devpi configuration):
```
export PIP_INDEX_URL=http://localhost:3141/root/pypi/+simple/
```
At first installation, the locally mirrored package index will access PyPI.
Later on, it will install packages from its local cache.
## Links
[0]: https://git-lfs.github.com/
......@@ -112,3 +131,4 @@ Note that this takes quite a while.
[2]: https://gitlab.dune-project.org/quality/dune-testtools
[3]: http://isl.gforge.inria.fr/
[4]: https://www.dune-project.org/doc/installation/
[5]: https://github.com/devpi/devpi/blob/master/doc/quickstart-pypimirror.rst
......@@ -5,8 +5,16 @@ import sys
import subprocess
import os
os.environ['DUNE_CODEGEN_THREADS'] = '20'
# Run the actual command
command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
# The actual measurements will be called like this (mpi parallelism)
# command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split()
# Command for autotune benchmarks with threading
command = "srun -p haswell10c -n 1 -c 20 --hint=nomultithread --cpu_bin=verbose,core".split()
command.extend(sys.argv[1:])
ret = subprocess.call(command)
......
# File for module specific CMake tests.
#
# .. cmake_function:: add_generated_executable
# .. cmake_function:: dune_add_generated_executable
#
# .. cmake_param:: UFLFILE
# :single:
......@@ -22,19 +22,20 @@
# The name given to the added executable target.
#
# .. cmake_param:: SOURCE
# :single:
#
# The cc source file to build from. If omitted, a minimal
# source file and a driver file will be generated.
#
# .. cmake_param:: FORM_COMPILER_ARGS
# :multi:
# :argname arg:
# :argname: arg
#
# Additional arguments as recognized by the form compiler.
#
# .. cmake_param:: DEPENDS
# :multi:
# :argname dep:
# :argname: dep
#
# Additional dependencies of the generated executable (changes in those
# will retrigger generation)
......@@ -57,7 +58,7 @@
#
# .. cmake_param:: ANALYZE_GRID_COMMAND
# :multi:
# :argname command:
# :argname: command
#
# Use this to pass a custom grid analysis command. This is necessary
# if you use a custom grid generation methdod. The inifile and the
......@@ -104,7 +105,7 @@ else()
endif()
file(GLOB_RECURSE UFL2PDELAB_SOURCES ${UFL2PDELAB_GLOB_PATTERN})
function(add_generated_executable)
function(dune_add_generated_executable)
set(OPTIONS EXCLUDE_FROM_ALL ANALYZE_GRID)
set(SINGLE TARGET SOURCE UFLFILE INIFILE)
set(MULTI FORM_COMPILER_ARGS DEPENDS ANALYZE_GRID_COMMAND)
......@@ -112,15 +113,20 @@ function(add_generated_executable)
cmake_parse_arguments(GEN "${OPTIONS}" "${SINGLE}" "${MULTI}" ${ARGN})
if(GEN_UNPARSED_ARGUMENTS)
message(FATAL_ERROR "Unrecognized arguments in add_generated_executable. This usually indicates a typo.")
message(FATAL_ERROR "Unrecognized arguments in dune_add_generated_executable. This usually indicates a typo.")
endif()
set(MPI_OPTION "0")
if(MPI_FOUND)
set(MPI_OPTION "1")
endif()
# Apply defaults and enforce requirements
if(NOT GEN_TARGET)
message(FATAL_ERROR "Need to specify the TARGET parameter for add_generated_executable")
message(FATAL_ERROR "Need to specify the TARGET parameter for dune_add_generated_executable")
endif()
if(NOT GEN_UFLFILE)
message(FATAL_ERROR "Need to specify the UFLFILE parameter for add_generated_executable")
message(FATAL_ERROR "Need to specify the UFLFILE parameter for dune_add_generated_executable")
endif()
if(NOT IS_ABSOLUTE ${GEN_UFLFILE})
set(GEN_UFLFILE ${CMAKE_CURRENT_SOURCE_DIR}/${GEN_UFLFILE})
......@@ -138,6 +144,7 @@ function(add_generated_executable)
--target-name ${GEN_TARGET}
--driver-file ${GEN_SOURCE}
--project-basedir ${CMAKE_BINARY_DIR}
--with-mpi ${MPI_OPTION}
${GEN_FORM_COMPILER_ARGS}
DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES}
COMMENT "Generating driver for the target ${GEN_TARGET}"
......@@ -171,10 +178,8 @@ function(add_generated_executable)
endif()
# Parse a mapping of operators to build and their respective filenames
dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET}
OUTPUT_VARIABLE depdata
)
parse_python_data(PREFIX depdata INPUT ${depdata})
dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
parse_python_data(PREFIX depdata FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
if(DUNE_CODEGEN_PROFILING)
# This is a bit silly, but cProfile only finds entry point scripts
......@@ -198,6 +203,7 @@ function(add_generated_executable)
--ini-file ${GEN_INIFILE}
--target-name ${GEN_TARGET}
--operator-to-build ${op}
--with-mpi ${MPI_OPTION}
${ANALYZE_GRID_OPTION}
DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES} ${ANALYZE_GRID_FILE}
COMMENT "Generating operator file ${depdata___${op}} for the target ${GEN_TARGET}"
......
# System testing for generated executables. All ideas taken from dune-testtools.
# System testing for generated executables. All ideas taken from dune-testtools.
#
# .. cmake_function:: dune_add_formcompiler_system_test
#
# .. cmake_param:: UFLFILE
# :single:
# :required:
#
# The UFL file to create the generate code from.
#
# .. cmake_param:: INIFILE
# :single:
# :required:
#
# The ini file that controls the form compilation process.
# It is expected to contain a [formcompiler] section. This
# file may contain meta ini annotations.
#
# .. cmake_param:: BASENAME
# :single:
# :required:
#
# The basename for the generated executables.
#
# .. cmake_param:: SCRIPT
# :single:
#
# The python script that decides about test failure/success.
# Defaults to a script that simply runs the program and checks
# the exit code. More scripts to be found in dune-testtools.
#
# .. cmake_param:: SOURCE
# :single:
#
# The cc source file to build from. If omitted, a minimal
# source file and a driver file will be generated.
#
# .. cmake_param:: CREATED_TARGETS
# :single:
#
# A variable name that should be filled with the list of generated
# targets. This can be used to modify these lateron.
#
# .. cmake_param:: DEPENDS
# :multi:
# :argname: dep
#
# Additional dependencies of the generated executable (changes in those
# will retrigger generation)
#
# .. cmake_param:: NO_TESTS
# :option:
#
# If given, code will be generated and built normally, but no tests will
# be added to the test suite.
#
# .. cmake_param:: ANALYZE_GRID
# :option:
#
# Set this option to enable code generation time grid analysis.
# This is useful to reduce the variety of sum factorization kernels
# in unstructured grids. Note that the grid analysis tool needs to
# be able to construct your grid from the given inifile. If you have
# a custom grid construction method, you can use ANALYZE_GRID_COMMAND
# instead.
#
# .. cmake_param:: ANALYZE_GRID_COMMAND
# :multi:
# :argname: command
#
# Use this to pass a custom grid analysis command. This is necessary
# if you use a custom grid generation methdod. The inifile and the
# outputfile will be appended to this command. You can use the analysis code in
# dune/codegen/sumfact/analyzegrid.hh to write your own tool.
# Specifying this option will automatically set ANALYZE_GRID.
#
function(dune_add_formcompiler_system_test)
# parse arguments
set(OPTION DEBUG NO_TESTS ANALYZE_GRID)
set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE)
set(MULTI CREATED_TARGETS DEPENDS)
set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE CREATED_TARGETS)
set(MULTI DEPENDS ANALYZE_GRID_COMMAND)
cmake_parse_arguments(SYSTEMTEST "${OPTION}" "${SINGLE}" "${MULTI}" ${ARGN})
if(SYSTEMTEST_UNPARSED_ARGUMENTS)
......@@ -25,6 +99,10 @@ function(dune_add_formcompiler_system_test)
if(SYSTEMTEST_ANALYZE_GRID)
set(ANALYZE_GRID_STR "ANALYZE_GRID")
endif()
set(ANALYZE_GRID_COMMAND_STR "")
if(SYSTEMTEST_ANALYZE_GRID_COMMAND)
set(ANALYZE_GRID_COMMAND_STR "ANALYZE_GRID_COMMAND ${SYSTEMTEST_ANALYZE_GRID_COMMAND}")
endif()
# set a default for the script. call_executable.py just calls the executable.
# There, it is also possible to hook in things depending on the inifile
......@@ -41,9 +119,14 @@ function(dune_add_formcompiler_system_test)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} ${CMAKE_CURRENT_BINARY_DIR}/tmp_${SYSTEMTEST_INIFILE})
# expand the given meta ini file into the build tree
execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py --cmake --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} --dir ${CMAKE_CURRENT_BINARY_DIR} --section formcompiler
OUTPUT_VARIABLE output)
parse_python_data(PREFIX INIINFO INPUT "${output}")
execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py
--cmake
--ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE}
--dir ${CMAKE_CURRENT_BINARY_DIR}
--section formcompiler
--file ${CMAKE_CURRENT_BINARY_DIR}/interface.log
)
parse_python_data(PREFIX INIINFO FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
foreach(inifile ${INIINFO_names})
if(${INIINFO_${inifile}_suffix} STREQUAL "__empty")
......@@ -55,23 +138,25 @@ function(dune_add_formcompiler_system_test)
endif()
endif()
add_generated_executable(TARGET ${tname}
UFLFILE ${SYSTEMTEST_UFLFILE}
INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS}
EXCLUDE_FROM_ALL
${SOURCE}
${ANALYZE_GRID_STR}
)
dune_add_generated_executable(TARGET ${tname}
UFLFILE ${SYSTEMTEST_UFLFILE}
INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS}
EXCLUDE_FROM_ALL
${SOURCE}
${ANALYZE_GRID_STR}
${ANALYZE_GRID_COMMAND_STR}
)
# Enrich the target with preprocessor variables from the __static section
# just the way that dune-testtools does.
dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_extract_static.py
--ini ${inifile}
--file ${CMAKE_CURRENT_BINARY_DIR}/interface.log
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
OUTPUT_VARIABLE output
ERROR_MESSAGE "Error extracting static info from ${inifile}")
parse_python_data(PREFIX STAT INPUT "${output}")
parse_python_data(PREFIX STAT FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log)
foreach(config ${STAT___CONFIGS})
foreach(cd ${STAT___STATIC_DATA})
......@@ -90,7 +175,7 @@ function(dune_add_formcompiler_system_test)
_add_test(NAME ${tname}
COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env ${SYSTEMTEST_SCRIPT}
--exec ${tname}
--exec "$<TARGET_FILE_DIR:${tname}>/$<TARGET_FILE_NAME:${tname}>"
--ini "${CMAKE_CURRENT_BINARY_DIR}/${inifile}"
--source ${CMAKE_CURRENT_SOURCE_DIR}
--mpi-exec "${MPIEXEC}"
......
......@@ -7,7 +7,7 @@ from dune.testtools.cmakeoutput import printForCMake
import sys
ini = parse_ini_file(sys.argv[1])
section = ini["formcompiler"]
section = ini.get("formcompiler", {})
operators = section.get("operators", "r")
operators = [i.strip() for i in operators.split(",")]
......@@ -22,5 +22,5 @@ def get_filename(operator):
result = {"__{}".format(o): get_filename(o) for o in operators}
result["__operators"] = ";".join(operators)
printForCMake(result)
printForCMake(result, sys.argv[3])
sys.exit(0)
dune_cmake_sphinx_doc(MODULE_ONLY)
......@@ -4,6 +4,50 @@
#include<dune/codegen/common/vectorclass.hh>
// Only use our custom implementations if we have AVX2 or later!
#if INSTRSET >= 8
/** Implement a variant of horizontal_add(Vec2d) that avoids the haddpd
* instruction and instead uses the shuffle port.
*/
static inline double permuting_horizontal_add (const Vec2d & a)
{
return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a));
}
/** Implement a variant of horizontal_add(Vec4d) that avoids the haddpd
* instruction and instead uses the shuffle port.
*/
static inline double permuting_horizontal_add (const Vec4d& a)
{
__m128d valupper = _mm256_extractf128_pd(a, 1);
__m128d vallower = _mm256_castpd256_pd128(a);
__m128d valval = _mm_add_pd(valupper, vallower);
__m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);
return _mm_cvtsd_f64(res);
}
#if MAX_VECTOR_SIZE >= 512
/** Implement a variant of horizontal_add(Vec8d) that avoids the haddpd
* instruction and instead uses the shuffle port.
*/
static inline double permuting_horizontal_add(const Vec8d& a)
{
return permuting_horizontal_add(a.get_low() + a.get_high());
}
#endif
#else
template<typename V>
static inline double permuting_horizontal_add (const V& a)
{
return horizontal_add(a);
}
#endif
template<class V>
typename base_floatingpoint<V>::value horizontal_add_lower(const V& x)
{
......@@ -16,4 +60,16 @@ typename base_floatingpoint<V>::value horizontal_add_upper(const V& x)
return horizontal_add(x.get_high());
}
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
{
return permuting_horizontal_add(x.get_low());
}
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
{
return permuting_horizontal_add(x.get_high());
}
#endif
#ifndef DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH
#define DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH
#include<immintrin.h>
#include<dune/codegen/common/simdtraits.hh>
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x)
{
return horizontal_add(x.get_low());
}
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x)
{
return horizontal_add(x.get_high());
}
template<class V>
typename base_floatingpoint<V>::value permuting_horizontal_add(const V& x)
{
return horizontal_add(x);
}
#endif
......@@ -15,8 +15,8 @@ void transpose_reg(Vec2d& a0, Vec2d& a1)
void transpose_reg(Vec4f& a0, Vec4f& a1)
{
Vec4f b0, b1;
b0 = blend4f<0,1,4,5>(a0,a1);
b1 = blend4f<2,3,6,7>(a0,a1);
b0 = blend4<0,1,4,5>(a0,a1);
b1 = blend4<2,3,6,7>(a0,a1);
a0 = b0;
a1 = b1;
}
......@@ -25,14 +25,14 @@ void transpose_reg(Vec4f& a0, Vec4f& a1)
void transpose_reg(Vec4f& a0, Vec4f& a1, Vec4f& a2, Vec4f& a3)
{
Vec4f b0,b1,b2,b3;
b0 = blend4f<0,4,2,6>(a0,a1);
b1 = blend4f<1,5,3,7>(a0,a1);
b2 = blend4f<0,4,2,6>(a2,a3);
b3 = blend4f<1,5,3,7>(a2,a3);
a0 = blend4f<0,1,4,5>(b0,b2);
a1 = blend4f<0,1,4,5>(b1,b3);
a2 = blend4f<2,3,6,7>(b0,b2);
a3 = blend4f<2,3,6,7>(b1,b3);
b0 = blend4<0,4,2,6>(a0,a1);
b1 = blend4<1,5,3,7>(a0,a1);
b2 = blend4<0,4,2,6>(a2,a3);
b3 = blend4<1,5,3,7>(a2,a3);
a0 = blend4<0,1,4,5>(b0,b2);
a1 = blend4<0,1,4,5>(b1,b3);
a2 = blend4<2,3,6,7>(b0,b2);
a3 = blend4<2,3,6,7>(b1,b3);
}
#if MAX_VECTOR_SIZE >= 256
......@@ -41,8 +41,8 @@ void transpose_reg(Vec4f& a0, Vec4f& a1, Vec4f& a2, Vec4f& a3)
void transpose_reg(Vec4d& a0, Vec4d& a1)
{
Vec4d b0, b1;
b0 = blend4d<0,1,4,5>(a0,a1);
b1 = blend4d<2,3,6,7>(a0,a1);
b0 = blend4<0,1,4,5>(a0,a1);
b1 = blend4<2,3,6,7>(a0,a1);
a0 = b0;
a1 = b1;
}
......@@ -51,22 +51,22 @@ void transpose_reg(Vec4d& a0, Vec4d& a1)
void transpose_reg(Vec4d& a0, Vec4d& a1, Vec4d& a2, Vec4d& a3)
{
Vec4d b0,b1,b2,b3;
b0 = blend4d<0,4,2,6>(a0,a1);
b1 = blend4d<1,5,3,7>(a0,a1);
b2 = blend4d<0,4,2,6>(a2,a3);
b3 = blend4d<1,5,3,7>(a2,a3);
a0 = blend4d<0,1,4,5>(b0,b2);
a1 = blend4d<0,1,4,5>(b1,b3);
a2 = blend4d<2,3,6,7>(b0,b2);
a3 = blend4d<2,3,6,7>(b1,b3);
b0 = blend4<0,4,2,6>(a0,a1);
b1 = blend4<1,5,3,7>(a0,a1);
b2 = blend4<0,4,2,6>(a2,a3);
b3 = blend4<1,5,3,7>(a2,a3);
a0 = blend4<0,1,4,5>(b0,b2);
a1 = blend4<0,1,4,5>(b1,b3);
a2 = blend4<2,3,6,7>(b0,b2);
a3 = blend4<2,3,6,7>(b1,b3);
}
// FLOAT 8 x 2
void transpose_reg (Vec8f& a0, Vec8f& a1)
{
Vec8f b0, b1;
b0 = blend8f<0,1,2,3,8,9,10,11>(a0, a1);
b1 = blend8f<4,5,6,7,12,13,14,15>(a0, a1);
b0 = blend8<0,1,2,3,8,9,10,11>(a0, a1);
b1 = blend8<4,5,6,7,12,13,14,15>(a0, a1);
a0 = b0;
a1 = b1;
}
......@@ -75,14 +75,14 @@ void transpose_reg (Vec8f& a0, Vec8f& a1)
void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3)
{
Vec8f b0, b1, b2, b3;
b0 = blend8f<0,1,8,9,2,3,10,11>(a0, a1);
b1 = blend8f<4,5,12,13,6,7,14,15>(a0, a1);
b2 = blend8f<0,1,8,9,2,3,10,11>(a2, a3);
b3 = blend8f<4,5,12,13,6,7,14,15>(a2, a3);
a0 = blend8f<0,1,2,3,8,9,10,11>(b0, b2);
a1 = blend8f<4,5,6,7,12,13,14,15>(b0, b2);
a2 = blend8f<0,1,2,3,8,9,10,11>(b1, b3);
a3 = blend8f<4,5,6,7,12,13,14,15>(b1, b3);
b0 = blend8<0,1,8,9,2,3,10,11>(a0, a1);
b1 = blend8<4,5,12,13,6,7,14,15>(a0, a1);
b2 = blend8<0,1,8,9,2,3,10,11>(a2, a3);
b3 = blend8<4,5,12,13,6,7,14,15>(a2, a3);
a0 = blend8<0,1,2,3,8,9,10,11>(b0, b2);
a1 = blend8<4,5,6,7,12,13,14,15>(b0, b2);
a2 = blend8<0,1,2,3,8,9,10,11>(b1, b3);
a3 = blend8<4,5,6,7,12,13,14,15>(b1, b3);
}
namespace impl
......@@ -101,14 +101,14 @@ namespace impl
void _transpose4x8(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3)
{
Vec8f b0,b1,b2,b3;
b0 = blend8f<0,8,2,10,4,12,6,14>(a0,a1);
b1 = blend8f<1,9,3,11,5,13,7,15>(a0,a1);
b2 = blend8f<0,8,2,10,4,12,6,14>(a2,a3);
b3 = blend8f<1,9,3,11,5,13,7,15>(a2,a3);
a0 = blend8f<0,1,8,9,4,5,12,13>(b0,b2);
a1 = blend8f<0,1,8,9,4,5,12,13>(b1,b3);
a2 = blend8f<2,3,10,11,6,7,14,15>(b0,b2);
a3 = blend8f<2,3,10,11,6,7,14,15>(b1,b3);
b0 = blend8<0,8,2,10,4,12,6,14>(a0,a1);
b1 = blend8<1,9,3,11,5,13,7,15>(a0,a1);
b2 = blend8<0,8,2,10,4,12,6,14>(a2,a3);
b3 = blend8<1,9,3,11,5,13,7,15>(a2,a3);
a0 = blend8<0,1,8,9,4,5,12,13>(b0,b2);
a1 = blend8<0,1,8,9,4,5,12,13>(b1,b3);
a2 = blend8<2,3,10,11,6,7,14,15>(b0,b2);
a3 = blend8<2,3,10,11,6,7,14,15>(b1,b3);
}
}
......@@ -132,14 +132,14 @@ void transpose_reg(Vec8f& a0, Vec8f& a1, Vec8f& a2, Vec8f& a3,
void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
{
Vec8d b0, b1, b2, b3;
b0 = blend8d<0,1,8,9,2,3,10,11>(a0, a1);
b1 = blend8d<4,5,12,13,6,7,14,15>(a0, a1);
b2 = blend8d<0,1,8,9,2,3,10,11>(a2, a3);
b3 = blend8d<4,5,12,13,6,7,14,15>(a2, a3);
a0 = blend8d<0,1,2,3,8,9,10,11>(b0, b2);
a1 = blend8d<4,5,6,7,12,13,14,15>(b0, b2);
a2 = blend8d<0,1,2,3,8,9,10,11>(b1, b3);
a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3);
b0 = blend8<0,1,8,9,2,3,10,11>(a0, a1);
b1 = blend8<4,5,12,13,6,7,14,15>(a0, a1);
b2 = blend8<0,1,8,9,2,3,10,11>(a2, a3);
b3 = blend8<4,5,12,13,6,7,14,15>(a2, a3);
a0 = blend8<0,1,2,3,8,9,10,11>(b0, b2);
a1 = blend8<4,5,6,7,12,13,14,15>(b0, b2);
a2 = blend8<0,1,2,3,8,9,10,11>(b1, b3);
a3 = blend8<4,5,6,7,12,13,14,15>(b1, b3);
}
// DOUBLE 8 x 2
......@@ -149,8 +149,8 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
void transpose_reg (Vec8d& a0, Vec8d& a1)
{
Vec8d b0, b1;
b0 = blend8d<0,1,2,3,8,9,10,11>(a0, a1);
b1 = blend8d<4,5,6,7,12,13,14,15>(a0, a1);
b0 = blend8<0,1,2,3,8,9,10,11>(a0, a1);
b1 = blend8<4,5,6,7,12,13,14,15>(a0, a1);
a0 = b0;
a1 = b1;
}
......@@ -171,14 +171,14 @@ namespace impl
void _transpose4x8(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3)
{
Vec8d b0,b1,b2,b3;
b0 = blend8d<0,8,2,10,4,12,6,14>(a0,a1);
b1 = blend8d<1,9,3,11,5,13,7,15>(a0,a1);
b2 = blend8d<0,8,2,10,4,12,6,14>(a2,a3);
b3 = blend8d<1,9,3,11,5,13,7,15>(a2,a3);
a0 = blend8d<0,1,8,9,4,5,12,13>(b0,b2);
a1 = blend8d<0,1,8,9,4,5,12,13>(b1,b3);
a2 = blend8d<2,3,10,11,6,7,14,15>(b0,b2);
a3 = blend8d<2,3,10,11,6,7,14,15>(b1,b3);
b0 = blend8<0,8,2,10,4,12,6,14>(a0,a1);
b1 = blend8<1,9,3,11,5,13,7,15>(a0,a1);
b2 = blend8<0,8,2,10,4,12,6,14>(a2,a3);
b3 = blend8<1,9,3,11,5,13,7,15>(a2,a3);
a0 = blend8<0,1,8,9,4,5,12,13>(b0,b2);
a1 = blend8<0,1,8,9,4,5,12,13>(b1,b3);
a2 = blend8<2,3,10,11,6,7,14,15>(b0,b2);
a3 = blend8<2,3,10,11,6,7,14,15>(b1,b3);
}
}
......@@ -215,10 +215,10 @@ void transpose_reg(Vec16f& a0, Vec16f& a1)
void transpose_reg(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3)
{
Vec16f b0,b1,b2,b3;
b0 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a0,a1);
b1 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a0,a1);
b2 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a2,a3);
b3 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a2,a3);
b0 = blend16<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a0,a1);
b1 = blend16<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a0,a1);
b2 = blend16<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(a2,a3);
b3 = blend16<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(a2,a3);
swap_halves(b0,b2);
swap_halves(b1,b3);
a0 = b0;
......@@ -235,14 +235,14 @@ namespace impl
void _transpose4x16(Vec16f& a0, Vec16f& a1, Vec16f& a2, Vec16f& a3)
{
Vec16f b0,b1,b2,b3;
b0 = blend16f<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a0,a1);
b1 = blend16f<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a0,a1);
b2 = blend16f<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a2,a3);
b3 = blend16f<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a2,a3);
a0 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b0,b2);
a1 = blend16f<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b1,b3);
a2 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b0,b2);
a3 = blend16f<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b1,b3);
b0 = blend16<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a0,a1);
b1 = blend16<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a0,a1);
b2 = blend16<0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29>(a2,a3);
b3 = blend16<2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31>(a2,a3);
a0 = blend16<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b0,b2);
a1 = blend16<0,1,2,3,16,17,18,19,8,9,10,11,24,25,26,27>(b1,b3);
a2 = blend16<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b0,b2);
a3 = blend16<4,5,6,7,20,21,22,23,12,13,14,15,28,29,30,31>(b1,b3);
}
}
......
......@@ -11,17 +11,17 @@
template<std::size_t ...S>
void apply_function(std::array<VECTYPE, M>& data, std::index_sequence<S...>)
void apply_function(std::array<VECTYPE, _M>& data, std::index_sequence<S...>)
{
transpose_reg(std::get<S>(data) ...);
}
void print(const std::array<VECTYPE, M>& data)
void print(const std::array<VECTYPE, _M>& data)
{
for(int i=0; i<M; ++i)
for(int i=0; i<_M; ++i)
{
for(int j=0; j<N; ++j)
for(int j=0; j<_N; ++j)
std::cout << data[i].extract(j) << " ";
std::cout << std::endl;
}
......@@ -31,37 +31,37 @@ void print(const std::array<VECTYPE, M>& data)
int main()
{
// Setup data
std::array<VECTYPE, M> testdata;
std::array<VECTYPE, _M> testdata;
int data = 0;
for(int i=0; i<M; ++i)
for(int j=0; j<N; ++j, ++data)
for(int i=0; i<_M; ++i)
for(int j=0; j<_N; ++j, ++data)
testdata[i].insert(j, data);
std::array<VECTYPE, M> originaldata(testdata);
std::array<VECTYPE, _M> originaldata(testdata);
// Print the test data once
std::cout << "Before transposition:" << std::endl;
print(testdata);
// Apply the transpose function
apply_function(testdata, std::make_index_sequence<M>{});
apply_function(testdata, std::make_index_sequence<_M>{});
// And print the test data once more
std::cout << "After transposition:" << std::endl;
print(testdata);
int result = 0;
for(int i=0; i<M; ++i)
for(int j=0; j<N; ++j)
for(int i=0; i<_M; ++i)
for(int j=0; j<_N; ++j)
{
// Dirty handcoding of the generic permutation
auto NM = (int)N / (int)M;
auto linear = i * N + j;
auto NM = (int)_N / (int)_M;
auto linear = i * _N + j;
auto block = linear / NM;
auto newblock = (block % M) * M + (block / M);
auto newblock = (block % _M) * _M + (block / _M);
auto newlinear = newblock * NM + (linear % NM);
auto newj = newlinear % N;
auto newi = newlinear / N;
auto newj = newlinear % _N;
auto newi = newlinear / _N;
if(testdata[newi].extract(newj) != originaldata[i].extract(j))
{
......
__name = test_transpose_{__exec_suffix}
__exec_suffix = {__static.BASETYPE}_{__static.N}x{__static.M}
__exec_suffix = {__static.BASETYPE}_{__static._N}x{__static._M}
# These transposes do not make sense
{__static.M} > {__static.N} | exclude
{__static.N} == 2 and {shorttype} == f | exclude
{__static.N} == 16 and {shorttype} == d | exclude
{__static._M} > {__static._N} | exclude
{__static._N} == 2 and {shorttype} == f | exclude
{__static._N} == 16 and {shorttype} == d | exclude
# These transposes are not yet implemented
{__static.M} == 16 | exclude
{__static._M} == 16 | exclude
shorttype = f, d | expand base
[__static]
N = 2, 4, 8, 16 | expand n
M = 2, 4, 8, 16 | expand m
_N = 2, 4, 8, 16 | expand n
_M = 2, 4, 8, 16 | expand m
BASETYPE = float, double | expand base
VECTYPE = Vec{__static.N}{shorttype}
VECTYPE = Vec{__static._N}{shorttype}
Subproject commit 28d002ca908af1e7b5956c7bffef2bfed7923c6b
Subproject commit b7088e62a67bac982fc60d9bb51a319d313e4160
......@@ -5,10 +5,6 @@ git apply ../../patches/loopy/Current.patch
git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch
popd
pushd dune/codegen/vectorclass
git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch
popd
pushd python/ufl
git apply ../../patches/ufl/0001-Remove-special-case-for-variable-in-ufl2dot.patch
popd
From a324181d74fd8cd81fb945a4f66e4502ffbc68a0 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Thu, 30 Nov 2017 18:51:49 +0100
Subject: [PATCH] Alternative implementation of horizontal_add on AVX512
---
vectorf512.h | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/vectorf512.h b/vectorf512.h
index 0845d12..6a15ac2 100644
--- a/vectorf512.h
+++ b/vectorf512.h
@@ -1339,14 +1339,21 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b)
// General arithmetic functions, etc.
+#if __GNUC__ < 7
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsd_f64 (__m512d __A)
+{
+ return __A[0];
+}
+#endif
// Horizontal add: Calculates the sum of all vector elements.
-static inline double horizontal_add (Vec8d const & a) {
-#if defined(__INTEL_COMPILER)
- return _mm512_reduce_add_pd(a);
-#else
- return horizontal_add(a.get_low() + a.get_high());
-#endif
+static inline double horizontal_add (Vec8d const & x) {
+ __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1)));
+ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 2)));
+ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 4)));
+ return _mm512_cvtsd_f64(intermediate);
}
// function max: a > b ? a : b
--
2.1.4
From 69f4ea4dcd018eb74c39a076a60fc27c0496e1dd Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Mon, 19 Jun 2017 13:07:22 +0200
Subject: [PATCH] Better implementation of horizontal_add
---
vectorf256.h | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/vectorf256.h b/vectorf256.h
index db509f8..2bbd9de 100644
--- a/vectorf256.h
+++ b/vectorf256.h
@@ -1692,10 +1692,11 @@ static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b)
// Horizontal add: Calculates the sum of all vector elements.
static inline double horizontal_add (Vec4d const & a) {
- __m256d t1 = _mm256_hadd_pd(a,a);
- __m128d t2 = _mm256_extractf128_pd(t1,1);
- __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
- return _mm_cvtsd_f64(t3);
+ const __m128d valupper = _mm256_extractf128_pd(a, 1);
+ const __m128d vallower = _mm256_castpd256_pd128(a);
+ const __m128d valval = _mm_add_pd(valupper, vallower);
+ const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval);
+ return _mm_cvtsd_f64(res);
}
// function max: a > b ? a : b
--
2.1.4
......@@ -24,6 +24,7 @@ add_subdirectory(test)
add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc)
target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing)
if(benchmark_FOUND)
target_link_libraries(_autotune_target benchmark)
find_package(Threads)
if(benchmark_FOUND AND Threads_FOUND)
target_link_libraries(_autotune_target benchmark Threads::Threads)
endif()
Subproject commit f411383630b272a3a5d3e28b82acaaa530a64723
Subproject commit ae1da38a47dd1f0cc256ffc8949d2b8bd51f1f1b
......@@ -10,3 +10,6 @@ import dune.codegen.loopy.symbolic # noqa
import dune.codegen.pdelab # noqa
import dune.codegen.sumfact # noqa
import dune.codegen.blockstructured # noqa
# Trigger imports that register finite elements in UFL
import dune.codegen.ufl.finiteelement # noqa
......@@ -7,6 +7,7 @@ from dune.codegen.pdelab.localoperator import determine_accumulation_space, Gene
from dune.codegen.pdelab.argument import name_accumulation_variable
from dune.codegen.pdelab.localoperator import boundary_predicates
from dune.codegen.generation.loopy import function_mangler, temporary_variable
from dune.codegen.tools import get_pymbolic_basename
import loopy as lp
import pymbolic.primitives as prim
......@@ -84,11 +85,15 @@ def generate_accumulation_instruction(expr, visitor):
if visitor.trial_info:
lfs_inames = lfs_inames.union(visitor.trial_info.inames)
deps = lp.symbolic.DependencyMapper()(accexpr)
deps = frozenset({Writes(get_pymbolic_basename(d)) for d in deps})
instruction(assignees=(),
expression=accexpr,
forced_iname_deps=lfs_inames.union(frozenset(quad_inames)),
forced_iname_deps_is_final=True,
predicates=predicates
predicates=predicates,
depends_on=deps,
)
......@@ -114,12 +119,15 @@ def generate_accumulation_instruction_vectorized(expr, visitor):
tuple(prim.Variable(i) for i in sub_element_inames() + lfs_inames))
expr_with_weight = prim.Product((expr, prim.Call(prim.Variable(accumvar + '.weight'), ())))
accum_expr = prim.Sum((expr_with_weight, assignee))
deps = lp.symbolic.DependencyMapper()(accum_expr)
deps = frozenset({Writes(get_pymbolic_basename(d)) for d in deps})
instruction(assignee=assignee,
expression=prim.Sum((expr_with_weight, assignee)),
expression=accum_expr,
forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)),
forced_iname_deps_is_final=True,
predicates=predicates,
tags=frozenset({'accum'}),
depends_on=frozenset({Writes(accumvar_alias)})
depends_on=deps
)
......@@ -22,7 +22,7 @@ from dune.codegen.pdelab.geometry import world_dimension, component_iname
from dune.codegen.pdelab.spaces import type_leaf_gfs, name_lfs
from dune.codegen.pdelab.restriction import restricted_name
from dune.codegen.blockstructured.spaces import lfs_inames
from dune.codegen.blockstructured.tools import tensor_index_to_sequential_index, sub_element_inames
from dune.codegen.blockstructured.tools import tensor_index_to_sequential_index, remove_sub_element_inames
from ufl import MixedElement
......@@ -48,12 +48,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
assert not isinstance(element, MixedElement)
name = "phi_{}".format(FEM_name_mangling(element))
name = restricted_name(name, restriction)
self.init_cache(element, 'Function')
self.init_basis_cache(element, restriction)
self.evaluate_basis(element, name, restriction)
inames = self.lfs_inames(element, restriction, number, context=context)
return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0))
@preamble(kernel='operator')
def init_basis_cache(self, element, restriction):
if not restriction:
cache = name_localbasis_cache(element)
localbasis = name_localbasis(element)
qp_name = get_pymbolic_basename(self.quadrature_position_in_micro())
return ["for (int i=0; i < {}.size(); ++i)".format(qp_name),
"{",
" {}.evaluateFunction({}[i], {});".format(cache, qp_name, localbasis),
"}"]
else:
return []
@kernel_cached
def evaluate_basis(self, element, name, restriction):
temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1),
......@@ -61,7 +74,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
cache = name_localbasis_cache(element)
qp = self.to_cell(self.quadrature_position_in_micro())
localbasis = name_localbasis(element)
instruction(inames=self.quadrature_inames(),
instruction(inames=remove_sub_element_inames(self.quadrature_inames()),
code='{} = {}.evaluateFunction({}, {});'.format(name, cache, str(qp), localbasis),
assignees=frozenset({name}),
read_variables=frozenset({get_pymbolic_basename(qp)}),
......@@ -71,12 +84,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
assert not isinstance(element, MixedElement)
name = "js_{}".format(FEM_name_mangling(element))
name = restricted_name(name, restriction)
self.init_cache(element, 'Jacobian')
self.init_gradient_cache(element, restriction)
self.evaluate_reference_gradient(element, name, restriction)
inames = self.lfs_inames(element, restriction, number, context=context)
return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0))
@preamble(kernel='operator')
def init_gradient_cache(self, element, restriction):
if not restriction:
cache = name_localbasis_cache(element)
localbasis = name_localbasis(element)
qp_name = get_pymbolic_basename(self.quadrature_position_in_micro())
return ["for (int i=0; i < {}.size(); ++i)".format(qp_name),
"{",
" {}.evaluateJacobian({}[i], {});".format(cache, qp_name, localbasis),
"}"]
else:
return []
@kernel_cached
def evaluate_reference_gradient(self, element, name, restriction):
temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1, world_dimension()),
......@@ -84,7 +110,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
cache = name_localbasis_cache(element)
qp = self.to_cell(self.quadrature_position_in_micro())
localbasis = name_localbasis(element)
instruction(inames=self.quadrature_inames(),
instruction(inames=remove_sub_element_inames(self.quadrature_inames()),
code='{} = {}.evaluateJacobian({}, {});'.format(name, cache, str(qp), localbasis),
assignees=frozenset({name}),
read_variables=frozenset({get_pymbolic_basename(qp)}),
......@@ -116,7 +142,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True),
assignee=assignee,
forced_iname_deps=frozenset(self.quadrature_inames() + (dimindex,) + sub_element_inames()),
forced_iname_deps=frozenset(self.quadrature_inames() + (dimindex,)),
forced_iname_deps_is_final=True,
)
......@@ -143,7 +169,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin):
instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True),
assignee=assignee,
forced_iname_deps=frozenset(self.quadrature_inames() + sub_element_inames()),
forced_iname_deps=frozenset(self.quadrature_inames()),
forced_iname_deps_is_final=True,
)
......@@ -155,20 +181,11 @@ def typedef_localbasis(element, name):
r = type_floatingpoint()
dim = world_dimension()
if isPk(element):
if dim == 1:
include_file("dune/localfunctions/lagrange/pk1d/pk1dlocalbasis.hh", filetag="operatorfile")
basis_type = "Pk1DLocalBasis<{}, {}, {}>".format(df, r, element._degree)
elif dim == 2:
include_file("dune/localfunctions/lagrange/pk2d/pk2dlocalbasis.hh", filetag="operatorfile")
basis_type = "Pk2DLocalBasis<{}, {}, {}>".format(df, r, element._degree)
elif dim == 3:
include_file("dune/localfunctions/lagrange/pk3d/pk3dlocalbasis.hh", filetag="operatorfile")
basis_type = "Pk3DLocalBasis<{}, {}, {}>".format(df, r, element._degree)
else:
raise NotImplementedError("P{} in {}D is not implemented".format(element._degree, dim))
include_file("dune/localfunctions/lagrange/lagrangesimplex.hh", filetag="operatorfile")
basis_type = "Impl::LagrangeSimplexLocalBasis<{}, {}, {}, {}>".format(df, r, dim, element._degree)
elif isQk(element):
include_file("dune/localfunctions/lagrange/qk/qklocalbasis.hh", filetag="operatorfile")
basis_type = "QkLocalBasis<{}, {}, {}, {}>".format(df, r, element._degree, dim)
include_file("dune/localfunctions/lagrange/lagrangecube.hh", filetag="operatorfile")
basis_type = "Impl::LagrangeCubeLocalBasis<{}, {}, {}, {}>".format(df, r, dim, element._degree)
else:
raise NotImplementedError("Element type not known in code generation")
return "using {} = Dune::{};".format(name, basis_type)
......
......@@ -2,14 +2,18 @@ from dune.codegen.error import CodegenError
from dune.codegen.generation import quadrature_mixin, domain, iname
from dune.codegen.pdelab.geometry import world_dimension
from dune.codegen.pdelab.quadrature import GenericQuadratureMixin, quadrature_order
from dune.codegen.blockstructured.tools import name_point_in_macro
from dune.codegen.blockstructured.tools import name_point_in_macro, sub_element_inames, remove_sub_element_inames
import pymbolic.primitives as prim
@quadrature_mixin("blockstructured")
class BlockstructuredQuadratureMixin(GenericQuadratureMixin):
@staticmethod
def _subscript_without_sub_element_inames(s):
return prim.Subscript(s.aggregate, remove_sub_element_inames(s.index_tuple))
def quadrature_position_in_micro(self, index=None):
return GenericQuadratureMixin.quadrature_position(self, index)
return self._subscript_without_sub_element_inames(super().quadrature_position(index))
def quadrature_position_in_macro(self, index=None):
original = self.quadrature_position_in_micro(index)
......@@ -19,6 +23,9 @@ class BlockstructuredQuadratureMixin(GenericQuadratureMixin):
else:
return qp
def quadrature_weight(self, o):
return self._subscript_without_sub_element_inames(super().quadrature_weight(o))
def quadrature_position(self, index=None):
raise CodegenError('Decide if the quadrature point is in the macro or micro element')
......@@ -28,7 +35,7 @@ class BlockstructuredQuadratureMixin(GenericQuadratureMixin):
@iname
def quadrature_inames(self):
return (self.quadrature_iname(),)
return (self.quadrature_iname(),) + sub_element_inames()
def quadrature_bound():
......
......@@ -22,6 +22,11 @@ def sub_element_inames():
return inames
def remove_sub_element_inames(indices):
assert isinstance(indices, tuple)
return tuple(set(indices) - set(sub_element_inames()) - set(prim.Variable(i) for i in sub_element_inames()))
# compute sequential index for given tensor index, the same as index in base-k to base-10
def tensor_index_to_sequential_index(indices, k):
return prim.Sum(tuple(prim.Variable(index) * k ** i for i, index in enumerate(indices)))
......@@ -64,7 +69,7 @@ def define_point_in_macro(name, point_in_micro, visitor):
# TODO relax within inames
instruction(assignee=prim.Subscript(prim.Variable(name), (i,)),
expression=expr,
within_inames=frozenset(subelem_inames + visitor.quadrature_inames()),
within_inames=frozenset(visitor.quadrature_inames()),
tags=frozenset({subelem_inames[i]})
)
......
......@@ -40,7 +40,7 @@ def add_vcl_temporaries(knl, vcl_size):
from loopy.kernel.data import VectorizeTag
return knl.copy(instructions=knl.instructions + new_insns, domains=knl.domains + [init_domain],
temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries),
iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()}),
iname_to_tags=dict(**knl.iname_to_tags, **{init_iname: frozenset({VectorizeTag()})}),
silenced_warnings=knl.silenced_warnings + silenced_warnings)
......
......@@ -23,3 +23,7 @@ class CodegenVectorizationError(CodegenCodegenError):
class CodegenAutotuneError(CodegenVectorizationError):
pass
class CodegenUnsupportedFiniteElementError(CodegenUFLError):
pass
......@@ -61,4 +61,4 @@ def register_liwkid_timer(name):
def dump_ssc_marks(name):
from dune.codegen.pdelab.driver.timings import get_region_marks
return 'std::cout << "{}: " << {} << " <--> " << {} << std::endl;'.format(name,
*get_region_marks(name, driver=False))
*get_region_marks(name, driver=False))
......@@ -17,8 +17,8 @@ silenced_warning = generator_factory(item_tags=("silenced_warning",), no_deco=Tr
kernel_cached = generator_factory(item_tags=("default_cached",), context_tags="kernel")
class DuneGlobalArg(lp.GlobalArg):
allowed_extra_kwargs = lp.GlobalArg.allowed_extra_kwargs + ["managed"]
class DuneGlobalArg(lp.ArrayArg):
allowed_extra_kwargs = lp.ArrayArg.allowed_extra_kwargs + ["managed"]
@generator_factory(item_tags=("argument", "globalarg"),
......@@ -29,7 +29,12 @@ def globalarg(name, shape=lp.auto, managed=True, **kw):
shape = (shape,)
from dune.codegen.loopy.target import dtype_floatingpoint
dtype = kw.pop("dtype", dtype_floatingpoint())
return DuneGlobalArg(name, dtype=dtype, shape=shape, managed=managed, **kw)
return DuneGlobalArg(name,
dtype=dtype,
shape=shape,
managed=managed,
address_space=lp.AddressSpace.GLOBAL,
**kw)
@generator_factory(item_tags=("argument", "constantarg"),
......
[loggers]
keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.vectorization
keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.transformations,dune.codegen.sumfact.vectorization
[handlers]
keys=consoleHandler
......@@ -16,6 +16,12 @@ handlers=consoleHandler
qualname=dune.codegen.pdelab.localoperator
propagate=0
[logger_dune.codegen.sumfact.transformations]
level=INFO
handlers=consoleHandler
qualname=dune.codegen.sumfact.transformations
propagate=0
[logger_dune.codegen.sumfact.vectorization]
level=INFO
handlers=consoleHandler
......
......@@ -29,8 +29,8 @@ class FusedMultiplyAdd(prim.Expression):
def __getinitargs__(self):
return (self.mul_op1, self.mul_op2, self.add_op)
def stringifier(self):
return lp.symbolic.StringifyMapper
def make_stringifier(self, originating_stringifier=None):
return lp.symbolic.StringifyMapper()
mapper_method = intern("map_fused_multiply_add")
......
from dune.codegen.options import get_form_option
from dune.codegen.sumfact.transformations import sumfact_performance_transformations
def performance_transformations(kernel, signature):
if get_form_option("sumfact"):
kernel = sumfact_performance_transformations(kernel, signature)
return kernel
import loopy as lp
import pymbolic.primitives as prim
def remove_reduction(knl, match):
"""Removes all mat