From 2bae92dc5e4fa45597692f96ad87c7f927350446 Mon Sep 17 00:00:00 2001 From: sheaf Date: Thu, 29 Aug 2024 00:26:19 +0200 Subject: [PATCH] more SIMD experiments --- MetaBrush.cabal | 2 +- brush-strokes/brush-strokes.cabal | 16 +++-- brush-strokes/cabal.project | 39 ------------ brush-strokes/cbits/mul.S | 30 +++++++++ brush-strokes/cbits/rounding.c | 9 +++ brush-strokes/src/cusps/bench/Main.hs | 7 +++ .../src/lib/Math/Interval/Internal/SIMD.hs | 62 ++++++++----------- cabal.project | 5 +- 8 files changed, 87 insertions(+), 83 deletions(-) delete mode 100644 brush-strokes/cabal.project create mode 100644 brush-strokes/cbits/mul.S create mode 100644 brush-strokes/cbits/rounding.c diff --git a/MetaBrush.cabal b/MetaBrush.cabal index 5893218..69eb3e2 100644 --- a/MetaBrush.cabal +++ b/MetaBrush.cabal @@ -161,7 +161,7 @@ common gtk , haskell-gi >= 0.26.10 && < 0.27 , haskell-gi-base - >= 0.26.10 && < 0.27 + >= 0.26.6 && < 0.27 -- Workaround for https://github.com/haskell/cabal/issues/4237 -- See https://github.com/commercialhaskell/stack/issues/2197 diff --git a/brush-strokes/brush-strokes.cabal b/brush-strokes/brush-strokes.cabal index 5beaaf9..3fef7fe 100644 --- a/brush-strokes/brush-strokes.cabal +++ b/brush-strokes/brush-strokes.cabal @@ -10,11 +10,11 @@ description: Computing brush strokes using BΓ©zier curves. extra-source-files: - cbits/**/*.{cpp, hpp} + cbits/**/*.{S,c,h,cpp,hpp} flag use-simd description: Use SIMD instructions to implement interval arithmetic. - default: True + default: False manual: True flag use-fma @@ -106,7 +106,7 @@ common common cpp-options: -DUSE_SIMD ghc-options: - -mavx2 + -mavx build-depends: base >= 4.20 @@ -213,15 +213,21 @@ library , rounded-hw ^>= 0.4 , time - ^>= 1.12.2 + >= 1.12.2 && < 1.15 , transformers >= 0.5.6.2 && < 0.7 - -- Extra C++ code for 2D linear systems of interval equations include-dirs: cbits + c-sources: + -- C code for: setting the rounding mode + cbits/rounding.c cxx-sources: + -- Walter's C++ code for 2D linear systems of interval equations cbits/lp_2d.cpp + asm-sources: + -- X86 assembly for interval multiplication using SIMD instructions + cbits/mul.S cxx-options: -std=c++20 -mavx2 diff --git a/brush-strokes/cabal.project b/brush-strokes/cabal.project deleted file mode 100644 index d4e54c1..0000000 --- a/brush-strokes/cabal.project +++ /dev/null @@ -1,39 +0,0 @@ -packages: . - -constraints: - acts -finitary, - fp-ieee +fma3, - rounded-hw -pure-hs -c99 -avx512 +ghc-prim -x87-long-double - -tests: True - -allow-newer: - *:base, *:template-haskell, - acts:deepseq, - falsify:tasty, - indexed-traversable:containers - --------------- --- GHC 9.10 -- --------------- - --- eigen -source-repository-package - type: git - location: https://github.com/sheaf/eigen - tag: 3bc1d4bc53edb75e2ee3893763361aa0d5c7714a - --------------- --- GHC HEAD -- --------------- - -repository head.hackage.ghc.haskell.org - url: https://ghc.gitlab.haskell.org/head.hackage/ - secure: True - key-threshold: 3 - root-keys: - 7541f32a4ccca4f97aea3b22f5e593ba2c0267546016b992dfadcd2fe944e55d - 26021a13b401500c8eb2761ca95c61f2d625bfef951b939a8124ed12ecf07329 - f76d08be13e9a61a377a85e2fb63f4c5435d40f8feb3e12eb05905edb8cdea89 - -active-repositories: hackage.haskell.org, head.hackage.ghc.haskell.org:override diff --git a/brush-strokes/cbits/mul.S b/brush-strokes/cbits/mul.S new file mode 100644 index 0000000..1abee26 --- /dev/null +++ b/brush-strokes/cbits/mul.S @@ -0,0 +1,30 @@ +.global in2_mul8 +in2_mul8: + vshufpd $3, %xmm1, %xmm1, %xmm4 + vshufpd $3, %xmm2, %xmm2, %xmm3 + vxorpd %xmm1, %xmm4, %xmm5 + vxorpd %xmm2, %xmm3, %xmm6 + vtestpd %xmm5, %xmm6 + je .LBB1_2 + vxorpd %xmm5, %xmm5, %xmm5 + vcmplepd %xmm5, %xmm3, %xmm3 + vcmplepd %xmm5, %xmm4, %xmm4 + vmovsd .LCPI1_1(%rip), %xmm5 + vxorpd %xmm5, %xmm1, %xmm1 + vshufps $78, %xmm1, %xmm1, %xmm6 + vblendvpd %xmm3, %xmm6, %xmm1, %xmm1 + vxorpd %xmm5, %xmm2, %xmm2 + vshufps $78, %xmm2, %xmm2, %xmm3 + vblendvpd %xmm4, %xmm3, %xmm2, %xmm2 + vxorpd %xmm5, %xmm2, %xmm2 + vmulpd %xmm2, %xmm1, %xmm1 + jmp *(%rbp) +.LCPI1_1: + .quad -9223372036854775808 +.LBB1_2: + vshufpd $1, %xmm1, %xmm1, %xmm4 + vmovddup %xmm2, %xmm2 + vmulpd %xmm4, %xmm2, %xmm2 + vmulpd %xmm1, %xmm3, %xmm1 + vmaxpd %xmm1, %xmm2, %xmm1 + jmp *(%rbp) diff --git a/brush-strokes/cbits/rounding.c b/brush-strokes/cbits/rounding.c new file mode 100644 index 0000000..04f6b21 --- /dev/null +++ b/brush-strokes/cbits/rounding.c @@ -0,0 +1,9 @@ +#include + +// Function to set the CSR rounding mode for SSE operations +void set_sse_rounding_mode(unsigned int rounding_mode) { + unsigned int csr = _mm_getcsr(); + csr &= ~_MM_ROUND_MASK; // Clear existing rounding mode bits + csr |= rounding_mode; // Set new rounding mode + _mm_setcsr(csr); +} diff --git a/brush-strokes/src/cusps/bench/Main.hs b/brush-strokes/src/cusps/bench/Main.hs index d9fd97d..5a15b3b 100644 --- a/brush-strokes/src/cusps/bench/Main.hs +++ b/brush-strokes/src/cusps/bench/Main.hs @@ -51,8 +51,15 @@ import Bench.Types -------------------------------------------------------------------------------- +pattern ROUND_UP :: Word +pattern ROUND_UP = 0x4000 + +foreign import ccall "set_sse_rounding_mode" + setSSERoundingMode :: Word -> IO () + main :: IO () main = withCP65001 $ do + setSSERoundingMode ROUND_UP putStrLn "Running cusp-finding benchmarks." for_ benchGroups $ \ ( groupName, benchGroupCases ) -> do putStrLn $ unlines diff --git a/brush-strokes/src/lib/Math/Interval/Internal/SIMD.hs b/brush-strokes/src/lib/Math/Interval/Internal/SIMD.hs index 15f3267..7d6c89c 100644 --- a/brush-strokes/src/lib/Math/Interval/Internal/SIMD.hs +++ b/brush-strokes/src/lib/Math/Interval/Internal/SIMD.hs @@ -1,4 +1,6 @@ {-# LANGUAGE CPP #-} +{-# LANGUAGE GHCForeignImportPrim #-} +{-# LANGUAGE UnliftedFFITypes #-} module Math.Interval.Internal.SIMD ( 𝕀(𝕀, inf, sup) @@ -12,7 +14,7 @@ import qualified Prelude import GHC.Exts ( isTrue# , Double(D#), Double# - , (<=##), (*##), negateDouble# + , (<=##), negateDouble# , DoubleX2# , packDoubleX2#, unpackDoubleX2# , broadcastDoubleX2# @@ -40,26 +42,31 @@ import Math.Ring -- | A non-empty interval of real numbers (possibly unbounded). data 𝕀 = MkI DoubleX2# - -- Interval [lo..hi] represented as (hi, -lo) + -- Interval [lo..hi] represented as (-lo, hi) + +foreign import prim "in2_mul8" + mulI :: DoubleX2# -> DoubleX2# -> DoubleX2# instance NFData 𝕀 where rnf ( MkI !_ ) = () {-# COMPLETE PackI #-} +{-# INLINE PackI #-} pattern PackI :: Double# -> Double# -> 𝕀 -pattern PackI hi m_lo <- ( ( \ ( MkI x ) -> unpackDoubleX2# x ) -> (# hi, m_lo #) ) +pattern PackI m_lo hi <- ( ( \ ( MkI x ) -> unpackDoubleX2# x ) -> (# m_lo, hi #) ) where - PackI hi m_lo = MkI ( packDoubleX2# (# hi, m_lo #) ) + PackI m_lo hi = MkI ( packDoubleX2# (# m_lo, hi #) ) {-# COMPLETE 𝕀 #-} +{-# INLINE 𝕀 #-} pattern 𝕀 :: Double -> Double -> 𝕀 -pattern 𝕀 { inf, sup } <- ( ( \ ( PackI hi m_lo ) -> (# D# ( negateDouble# m_lo ), D# hi #) ) +pattern 𝕀 { inf, sup } <- ( ( \ ( PackI m_lo hi ) -> (# D# ( negateDouble# m_lo ), D# hi #) ) -> (# inf, sup #) ) where 𝕀 (D# lo) (D# hi) = let m_lo = negateDouble# lo - in PackI hi m_lo + in PackI m_lo hi deriving via ViaPrelude 𝕀 instance AbelianGroup 𝕀 @@ -73,56 +80,37 @@ instance Prelude.Num 𝕀 where fromInteger i = let !j = Prelude.fromInteger i in 𝕀 j j - abs x@(PackI hi m_lo) + abs x@(PackI m_lo hi) | D# m_lo <= 0 - = PackI hi m_lo + = PackI m_lo hi | D# hi <= 0 = Prelude.negate x | otherwise - = 𝕀 0 (max (D# m_lo) (D# hi)) + = 𝕀 0 ( D# ( maxDouble# m_lo hi ) ) signum _ = error "No implementation of signum for intervals" instance Ring 𝕀 where - ( PackI b ma ) * ( PackI d mc ) = - case ( zeroPos b ma, zeroPos d mc ) of - -- From "Fast and Correct SIMD Algorithms for Interval Arithmetic" - -- FrΓ©dΓ©ric Goualard, 2008 - ( LT, LT ) -> PackI (ma *## mc) ( ( negateDouble# b ) *## d) - ( LT, EQ ) -> PackI (ma *## mc) ( ma *## d ) - ( LT, GT ) -> PackI ( b *## ( negateDouble# mc ) ) ( ma *## d ) - ( EQ, LT ) -> PackI (ma *## mc) ( b *## mc ) - ( EQ, EQ ) -> PackI - ( maxDouble# ( ma *## d ) ( b *## mc ) ) - ( maxDouble# ( ma *## mc ) ( b *## d ) ) - ( EQ, GT ) -> PackI ( b *## d ) ( ma *## d ) - ( GT, LT ) -> PackI ( ( negateDouble# ma ) *## d ) ( b *## mc ) - ( GT, EQ ) -> PackI ( b *## d ) ( b *## mc ) - ( GT, GT ) -> PackI ( b *## d ) ( ( negateDouble# ma ) *## mc ) - where - zeroPos :: Double# -> Double# -> Ordering - zeroPos hi m_lo - | isTrue# (hi <=## 0.0##) - = LT - | isTrue# (m_lo <=## 0.0##) - = GT - | otherwise - = EQ + ( MkI x ) * ( MkI y ) = error "TODO: Cabal bug #5623 if used in TH" --MkI ( mulI x y ) + -- This uses the C code from Lockless Inc. + -- TODO: compare with + -- "Fast and Correct SIMD Algorithms for Interval Arithmetic" + -- FrΓ©dΓ©ric Goualard, 2008 iv ^ 1 = iv - PackI hi m_lo ^ n + PackI m_lo hi ^ n | odd n || isTrue# (m_lo <=## 0.0##) , let !(D# m_lo') = D# m_lo Prelude.^ n !(D# hi' ) = D# hi Prelude.^ n - = PackI hi' m_lo' + = PackI m_lo' hi' | isTrue# (hi <=## 0.0##) , let !(D# m_lo') = (D# hi ) Prelude.^ n !(D# hi') = (D# m_lo) Prelude.^ n - = PackI hi' m_lo' + = PackI m_lo' hi' | otherwise , let !(D# hi1) = (D# m_lo) Prelude.^ n !(D# hi2) = (D# hi ) Prelude.^ n hi' = maxDouble# hi1 hi2 - = PackI hi' 0.0## + = PackI 0.0## hi' instance Prelude.Fractional 𝕀 where fromRational r = diff --git a/cabal.project b/cabal.project index 6cfca0a..d78a798 100644 --- a/cabal.project +++ b/cabal.project @@ -16,18 +16,21 @@ source-repository-package tag: ec171cd5d185309692b745e2e2f291eab4038fb9 allow-newer: - *:base, *:template-haskell, *:ghc-prim, + *:base, *:template-haskell, *:ghc-prim, *:Cabal, acts:deepseq, digit:lens, eigen:primitive, eigen:transformers, + falsify:tasty, gi-cairo-connector:mtl, hedgehog:containers, hedgehog:resourcet, + indexed-traversable:containers, JuicyPixels:zlib, natural:lens, natural:semigroupoids, records-sop:deepseq, + th-abstraction:containers, waargonaut:bifunctors, waargonaut:bytestring, waargonaut:containers,