% Journal article: So and Dean, ACM Transactions on Embedded Computing Systems, vol. 13, no. 1 (2013).
% Note: abstractNote is not a standard BibTeX field; standard styles ignore unknown fields.
@article{so_dean_2013,
  author       = {So, W. and Dean, A. G.},
  title        = {Software thread integration for instruction-level parallelism},
  journal      = {{ACM} Transactions on Embedded Computing Systems},
  volume       = {13},
  number       = {1},
  year         = {2013},
  doi          = {10.1145/2501626.2512466},
  abstractNote = {Multimedia applications require a significantly higher level of performance than previous workloads of embedded systems. They have driven digital signal processor (DSP) makers to adopt high-performance architectures like VLIW (Very-Long Instruction Word). Despite many efforts to exploit instruction-level parallelism (ILP) in the application, the speed is a fraction of what it could be, limited by the difficulty of finding enough independent instructions to keep all of the processor's functional units busy.},
}

% Journal article: So and Dean, ACM SIGPLAN Notices, vol. 40, no. 7, pp. 137--146 (July 2005).
@article{so_dean_2005,
  author       = {So, W. and Dean, A. G.},
  title        = {Complementing software pipelining with software thread integration},
  journal      = {{ACM} {SIGPLAN} Notices},
  volume       = {40},
  number       = {7},
  pages        = {137--146},
  year         = {2005},
  month        = jul,
  issn         = {1558-1160},
  doi          = {10.1145/1070891.1065930},
  abstractNote = {Software pipelining is a critical optimization for producing efficient code for VLIW/EPIC and superscalar processors in high-performance embedded applications such as digital signal processing. Software thread integration (STI) can often improve the performance of looping code in cases where software pipelining performs poorly or fails. This paper examines both situations, presenting methods to determine what and when to integrate. We evaluate our methods on C-language image and digital signal processing libraries and synthetic loop kernels. We compile them for a very long instruction word (VLIW) digital signal processor (DSP) -- the Texas Instruments (TI) C64x architecture. Loops which benefit little from software pipelining (SWP-Poor) speed up by 26% (harmonic mean, HM). Loops for which software pipelining fails (SWP-Fail) due to conditionals and calls speed up by 16% (HM). Combining SWP-Good and SWP-Poor loops leads to a speedup of 55% (HM).},
}