- Presburger Arithmetic in Memory Access Optimization for Data-Parallel Languages - FroCoS 2013
Karrenberg, R., Kosta, M. and Sturm, T.
Frontiers of Combining Systems, 2013.
[bib]
@inproceedings{KKS:2013:frocos,
  author          = {Karrenberg, Ralf and Kosta, Marek and Sturm, Thomas},
  title           = {{Presburger} Arithmetic in Memory Access Optimization for Data-Parallel Languages},
  booktitle       = {Frontiers of Combining Systems},
  booktitle_short = {FroCoS},
  year            = {2013},
}
- Improving Performance of OpenCL on CPUs - CC 2012
Karrenberg, R. and Hack, S.
Compiler Construction, 2012.
[url]
[bib]
@inproceedings{KH:2012:opencl,
  author          = {Karrenberg, Ralf and Hack, Sebastian},
  title           = {Improving Performance of {OpenCL} on {CPUs}},
  booktitle       = {Compiler Construction},
  booktitle_short = {CC},
  year            = {2012},
  url             = {http://www.cdl.uni-saarland.de/papers/karrenberg_opencl.pdf},
}
- Whole Function Vectorization
Karrenberg, R. and Hack, S.
International Symposium on Code Generation and Optimization, 2011.
[doi]
[url]
[slides]
[bib]
@inproceedings{KH:2011:cgo,
  author          = {Karrenberg, Ralf and Hack, Sebastian},
  title           = {{Whole} {Function} {Vectorization}},
  booktitle       = {International Symposium on Code Generation and Optimization},
  booktitle_short = {CGO},
  series          = {CGO},
  year            = {2011},
  doi             = {10.1109/CGO.2011.5764682},
  abstract        = {
    Data-parallel programming languages are an important component in
    today's parallel computing landscape. Among those are
    domain-specific languages like shading languages in graphics (HLSL,
    GLSL, RenderMan, etc.) and "general-purpose" languages like CUDA or
    OpenCL. Current implementations of those languages on CPUs solely
    rely on multi-threading to implement parallelism and ignore the
    additional intra-core parallelism provided by the SIMD instruction
    set of those processors (like Intel's SSE and the upcoming AVX or
    Larrabee instruction sets).
    In this paper, we discuss several aspects of implementing
    data-parallel languages on machines with SIMD instruction sets. Our
    main contribution is a language- and platform-independent code
    transformation that performs whole-function vectorization on
    low-level intermediate code given by a control flow graph in SSA
    form.
    We evaluate our technique in two scenarios: First, incorporated in a
    compiler for a domain-specific language used in real-time ray
    tracing. Second, in a stand-alone OpenCL driver. We observe average
    speedup factors of 3.9 for the ray tracer and factors between 0.6
    and 5.2 for different OpenCL kernels.
  },
  webslides       = {http://www.cdl.uni-saarland.de/projects/wfv/wfv_cgo11_slides.pdf},
  url             = {http://www.cdl.uni-saarland.de/papers/karrenberg_wfv.pdf},
  acc_rate        = {26.7},
  accepted        = {28},
  submitted       = {105},
}
- AnySL: Efficient and Portable Shading for Ray Tracing - HPG 2010
Karrenberg, R., Rubinstein, D., Slusallek, P. and Hack, S.
Proceedings of the Conference on High Performance Graphics, pages 97–105, Eurographics Association, 2010.
[url]
[slides]
[bib]
@inproceedings{KRSH:2010:hpg,
  author          = {Karrenberg, Ralf and Rubinstein, Dmitri and Slusallek, Philipp and Hack, Sebastian},
  title           = {{AnySL}: Efficient and Portable Shading for Ray Tracing},
  booktitle       = {Proceedings of the Conference on High Performance Graphics},
  booktitle_short = {HPG},
  series          = {HPG '10},
  year            = {2010},
  location        = {Saarbr{\"u}cken, Germany},
  pages           = {97--105},
  numpages        = {9},
  url             = {http://portal.acm.org/citation.cfm?id=1921479.1921495},
  acmid           = {1921495},
  publisher       = {Eurographics Association},
  address         = {Aire-la-Ville, Switzerland},
  abstract        = {
    While a number of different shading languages have been developed,
    their efficient integration into an existing renderer is notoriously
    difficult, often boiling down to implementing an entire compiler
    toolchain for each language. Furthermore, no shading language is
    broadly supported across the variety of rendering systems.
    AnySL attacks this issue from multiple directions: We compile shaders
    from different languages into a common, portable representation, which
    uses subroutine threaded code: Every language operator is translated to
    a function call. Thus, the compiled shader is generic with respect to
    the used types and operators.
    The key component of our system is an embedded compiler that
    instantiates this generic code in terms of the renderer's native types
    and operations. It allows for flexible code transformations to match
    the internal structure of the renderer and eliminates all overhead due
    to the subroutine threaded code. For SIMD architectures we
    automatically perform vectorization of scalar shaders which speeds up
    rendering by a factor of 3.9 on average on SSE. The results are highly
    optimized, parallel shaders that operate directly on the internal data
    structures of a renderer. We show that both traditional shading
    languages such as RenderMan, but also C/C++-based shading languages,
    can be fully supported and deliver high performance across different
    CPU renderers.
  },
  webslides       = {http://www.cdl.uni-saarland.de/projects/anysl/anysl_hpg10_slides.pdf},
}