New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
20x slowdown when using Int32
instead of Int16
#54282
Comments
I tried this on a few different CPUs and architectures, and I get some weird behaviours. Apple Silicon (no change) — 1.10.2:
1.11.0-beta1:
Raspberry Pi 5 (no change) — 1.10.2:
1.11.0-beta1:
Intel 11900 (same as 10th-gen Intel from original post) — 1.10.2:
1.11.0-beta1:
Intel 12900 (same pattern as post but 3x slowdown) — 1.10.2:
1.11.0-beta1:
Ryzen 3700X (doesn't like f1) — 1.10.2:
1.11.0-beta1:
|
The optimization level seems to play a role. For Julia 1.10.2 and my Skylake machine I get:
Could it be that some [quoted text truncated in extraction]? Generally speaking, I would expect [quoted text truncated in extraction] |
LLVM IR for f1 on intel 12900: https://godbolt.org/z/bxfhvbP67 Int16: ; @ REPL[2]:1 within `f1`
define void @julia_f1_270([8 x i16]* noalias nocapture noundef nonnull sret([8 x i16]) align 2 dereferenceable(16) %0, [8 x i16]* nocapture noundef nonnull readonly align 2 dereferenceable(16) %1, {}* noundef nonnull align 16 dereferenceable(40) %2) #0 {
top:
; @ REPL[2]:2 within `f1`
; ┌ @ array.jl:945 within `iterate` @ array.jl:945
; │┌ @ essentials.jl:10 within `length`
%3 = bitcast {}* %2 to { i8*, i64, i16, i16, i32 }*
%arraylen_ptr = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %3, i64 0, i32 1
%arraylen = load i64, i64* %arraylen_ptr, align 8
; │└
; │┌ @ int.jl:520 within `<` @ int.jl:513
%.not = icmp eq i64 %arraylen, 0
; │└
br i1 %.not, label %guard_pass43, label %guard_exit29
L19: ; preds = %guard_exit34, %guard_exit29
%value_phi3.in = phi i64* [ %arrayptr106, %guard_exit29 ], [ %17, %guard_exit34 ]
%value_phi4 = phi i64 [ 2, %guard_exit29 ], [ %18, %guard_exit34 ]
; └
; @ REPL[2]:4 within `f1`
%4 = phi <8 x i16> [ %14, %guard_exit29 ], [ %9, %guard_exit34 ]
; @ REPL[2] within `f1`
%value_phi3 = load i64, i64* %value_phi3.in, align 8
; @ REPL[2]:3 within `f1`
; ┌ @ int.jl:117 within `isodd`
; │┌ @ number.jl:42 within `iszero`
; ││┌ @ promotion.jl:521 within `==`
%5 = and i64 %value_phi3, 1
; └└└
; ┌ @ REPL[1]:8 within `inc_index`
; │┌ @ broadcast.jl:903 within `materialize`
; ││┌ @ broadcast.jl:1118 within `copy`
; │││┌ @ ntuple.jl:69 within `ntuple`
; ││││┌ @ ntuple.jl:72 within `macro expansion`
; │││││┌ @ broadcast.jl:1118 within `#31`
; ││││││┌ @ broadcast.jl:681 within `_broadcast_getindex`
; │││││││┌ @ broadcast.jl:705 within `_getindex` @ broadcast.jl:706
; ││││││││┌ @ broadcast.jl:660 within `_broadcast_getindex`
; │││││││││┌ @ tuple.jl:31 within `getindex`
%6 = getelementptr inbounds [8 x [8 x i16]], [8 x [8 x i16]]* @_j_const1, i64 0, i64 %5, i64 0
; │││││││└└└
; │││││││ @ broadcast.jl:682 within `_broadcast_getindex`
; │││││││┌ @ broadcast.jl:709 within `_broadcast_getindex_evalf`
; ││││││││┌ @ int.jl:87 within `+`
%7 = bitcast i16* %6 to <8 x i16>*
%8 = load <8 x i16>, <8 x i16>* %7, align 2
%9 = add <8 x i16> %8, %4
; └└└└└└└└└
; @ REPL[2]:4 within `f1`
; ┌ @ array.jl:945 within `iterate`
; │┌ @ int.jl:520 within `<` @ int.jl:513
%exitcond.not = icmp eq i64 %value_phi4, %15
; │└
br i1 %exitcond.not, label %L72, label %guard_exit34
L72: ; preds = %guard_pass43, %L19
; └
%10 = phi <8 x i16> [ %20, %guard_pass43 ], [ %9, %L19 ]
%11 = bitcast [8 x i16]* %0 to <8 x i16>*
store <8 x i16> %10, <8 x i16>* %11, align 2
ret void
guard_exit29: ; preds = %top
; @ REPL[2]:2 within `f1`
; ┌ @ array.jl:945 within `iterate` @ array.jl:945
; │┌ @ essentials.jl:13 within `getindex`
%12 = bitcast {}* %2 to i64**
%arrayptr106 = load i64*, i64** %12, align 8
; └└
; @ REPL[2]:4 within `f1`
%13 = bitcast [8 x i16]* %1 to <8 x i16>*
%14 = load <8 x i16>, <8 x i16>* %13, align 2
%15 = add nuw nsw i64 %arraylen, 1
br label %L19
guard_exit34: ; preds = %L19
; ┌ @ array.jl:945 within `iterate`
; │┌ @ int.jl:1068 within `-` @ int.jl:86
%16 = add nsw i64 %value_phi4, -1
; │└
; │┌ @ essentials.jl:13 within `getindex`
%17 = getelementptr inbounds i64, i64* %arrayptr106, i64 %16
; │└
; │┌ @ int.jl:87 within `+`
%18 = add nuw i64 %value_phi4, 1
; └└
br label %L19
guard_pass43: ; preds = %top
%19 = bitcast [8 x i16]* %1 to <8 x i16>*
%20 = load <8 x i16>, <8 x i16>* %19, align 2
br label %L72
} Int32: ; @ REPL[2]:1 within `f1`
define void @julia_f1_272([8 x i32]* noalias nocapture noundef nonnull sret([8 x i32]) align 4 dereferenceable(32) %0, [8 x i32]* nocapture noundef nonnull readonly align 4 dereferenceable(32) %1, {}* noundef nonnull align 16 dereferenceable(40) %2) #0 {
top:
; @ REPL[2]:2 within `f1`
; ┌ @ array.jl:945 within `iterate` @ array.jl:945
; │┌ @ essentials.jl:10 within `length`
%3 = bitcast {}* %2 to { i8*, i64, i16, i16, i32 }*
%arraylen_ptr = getelementptr inbounds { i8*, i64, i16, i16, i32 }, { i8*, i64, i16, i16, i32 }* %3, i64 0, i32 1
%arraylen = load i64, i64* %arraylen_ptr, align 8
; │└
; │┌ @ int.jl:520 within `<` @ int.jl:513
%.not = icmp eq i64 %arraylen, 0
; │└
br i1 %.not, label %guard_pass43, label %guard_exit29
L19: ; preds = %guard_exit34, %scalar.ph
%value_phi3.in = phi i64* [ %bc.resume.val, %scalar.ph ], [ %68, %guard_exit34 ]
%value_phi4 = phi i64 [ %bc.resume.val203, %scalar.ph ], [ %69, %guard_exit34 ]
; └
; @ REPL[2]:4 within `f1`
%4 = phi <8 x i32> [ %66, %scalar.ph ], [ %9, %guard_exit34 ]
; @ REPL[2] within `f1`
%value_phi3 = load i64, i64* %value_phi3.in, align 8
; @ REPL[2]:3 within `f1`
; ┌ @ int.jl:117 within `isodd`
; │┌ @ number.jl:42 within `iszero`
; ││┌ @ promotion.jl:521 within `==`
%5 = and i64 %value_phi3, 1
; └└└
; ┌ @ REPL[1]:8 within `inc_index`
; │┌ @ broadcast.jl:903 within `materialize`
; ││┌ @ broadcast.jl:1118 within `copy`
; │││┌ @ ntuple.jl:69 within `ntuple`
; ││││┌ @ ntuple.jl:72 within `macro expansion`
; │││││┌ @ broadcast.jl:1118 within `#31`
; ││││││┌ @ broadcast.jl:681 within `_broadcast_getindex`
; │││││││┌ @ broadcast.jl:705 within `_getindex` @ broadcast.jl:706
; ││││││││┌ @ broadcast.jl:660 within `_broadcast_getindex`
; │││││││││┌ @ tuple.jl:31 within `getindex`
%6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @_j_const1, i64 0, i64 %5, i64 0
; │││││││└└└
; │││││││ @ broadcast.jl:682 within `_broadcast_getindex`
; │││││││┌ @ broadcast.jl:709 within `_broadcast_getindex_evalf`
; ││││││││┌ @ int.jl:87 within `+`
%7 = bitcast i32* %6 to <8 x i32>*
%8 = load <8 x i32>, <8 x i32>* %7, align 4
%9 = add <8 x i32> %8, %4
; └└└└└└└└└
; @ REPL[2]:4 within `f1`
; ┌ @ array.jl:945 within `iterate`
; │┌ @ int.jl:520 within `<` @ int.jl:513
%exitcond.not = icmp eq i64 %value_phi4, %15
; │└
br i1 %exitcond.not, label %L72, label %guard_exit34
L72: ; preds = %guard_pass43, %L19
; └
%10 = phi <8 x i32> [ %71, %guard_pass43 ], [ %9, %L19 ]
%11 = bitcast [8 x i32]* %0 to <8 x i32>*
store <8 x i32> %10, <8 x i32>* %11, align 4
ret void
guard_exit29: ; preds = %top
; @ REPL[2]:2 within `f1`
; ┌ @ array.jl:945 within `iterate` @ array.jl:945
; │┌ @ essentials.jl:13 within `getindex`
%12 = bitcast {}* %2 to i64**
%arrayptr106 = load i64*, i64** %12, align 8
; └└
; @ REPL[2]:4 within `f1`
%13 = bitcast [8 x i32]* %1 to <8 x i32>*
%14 = load <8 x i32>, <8 x i32>* %13, align 4
%15 = add nuw nsw i64 %arraylen, 1
%min.iters.check = icmp ult i64 %arraylen, 5
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %guard_exit29
%n.mod.vf = and i64 %arraylen, 3
%16 = icmp eq i64 %n.mod.vf, 0
%17 = select i1 %16, i64 4, i64 %n.mod.vf
%n.vec = sub nsw i64 %arraylen, %17
%ind.end = getelementptr i64, i64* %arrayptr106, i64 %n.vec
%ind.end202 = add i64 %n.vec, 2
%18 = shufflevector <8 x i32> %14, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%19 = shufflevector <4 x i32> %18, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
%20 = shufflevector <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> %18, <4 x i32> <i32 5, i32 1, i32 2, i32 3>
%21 = shufflevector <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> %18, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
%22 = shufflevector <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> %18, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
%23 = shufflevector <8 x i32> %14, <8 x i32> poison, <4 x i32> <i32 4, i32 undef, i32 undef, i32 undef>
%24 = shufflevector <4 x i32> %23, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
%25 = shufflevector <8 x i32> %14, <8 x i32> poison, <4 x i32> <i32 5, i32 undef, i32 undef, i32 undef>
%26 = shufflevector <4 x i32> %25, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
%27 = shufflevector <8 x i32> %14, <8 x i32> poison, <4 x i32> <i32 6, i32 undef, i32 undef, i32 undef>
%28 = shufflevector <4 x i32> %27, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
%29 = shufflevector <8 x i32> %14, <8 x i32> poison, <4 x i32> <i32 7, i32 undef, i32 undef, i32 undef>
%30 = shufflevector <4 x i32> %29, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i32> [ %19, %vector.ph ], [ %34, %vector.body ]
%vec.phi204 = phi <4 x i32> [ %20, %vector.ph ], [ %36, %vector.body ]
%vec.phi205 = phi <4 x i32> [ %21, %vector.ph ], [ %38, %vector.body ]
%vec.phi206 = phi <4 x i32> [ %22, %vector.ph ], [ %40, %vector.body ]
%vec.phi207 = phi <4 x i32> [ %24, %vector.ph ], [ %42, %vector.body ]
%vec.phi208 = phi <4 x i32> [ %26, %vector.ph ], [ %44, %vector.body ]
%vec.phi209 = phi <4 x i32> [ %28, %vector.ph ], [ %46, %vector.body ]
%vec.phi210 = phi <4 x i32> [ %30, %vector.ph ], [ %48, %vector.body ]
%next.gep = getelementptr i64, i64* %arrayptr106, i64 %index
; @ REPL[2] within `f1`
%31 = bitcast i64* %next.gep to <4 x i64>*
%wide.load = load <4 x i64>, <4 x i64>* %31, align 8
; @ REPL[2]:3 within `f1`
; ┌ @ int.jl:117 within `isodd`
; │┌ @ number.jl:42 within `iszero`
; ││┌ @ promotion.jl:521 within `==`
%32 = and <4 x i64> %wide.load, <i64 1, i64 1, i64 1, i64 1>
%33 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @_j_const1, i64 0, <4 x i64> %32, i64 0
; └└└
; ┌ @ REPL[1]:8 within `inc_index`
; │┌ @ broadcast.jl:903 within `materialize`
; ││┌ @ broadcast.jl:1118 within `copy`
; │││┌ @ ntuple.jl:69 within `ntuple`
; ││││┌ @ ntuple.jl:72 within `macro expansion`
; │││││┌ @ broadcast.jl:1118 within `#31`
; ││││││┌ @ broadcast.jl:682 within `_broadcast_getindex`
; │││││││┌ @ broadcast.jl:709 within `_broadcast_getindex_evalf`
; ││││││││┌ @ int.jl:87 within `+`
%wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %33, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%34 = add <4 x i32> %wide.masked.gather, %vec.phi
%35 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @_j_const1, i64 0, <4 x i64> %32, i64 1
%wide.masked.gather211 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %35, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%36 = add <4 x i32> %wide.masked.gather211, %vec.phi204
%37 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @_j_const1, i64 0, <4 x i64> %32, i64 2
%wide.masked.gather212 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %37, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%38 = add <4 x i32> %wide.masked.gather212, %vec.phi205
%39 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @_j_const1, i64 0, <4 x i64> %32, i64 3
%wide.masked.gather213 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %39, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%40 = add <4 x i32> %wide.masked.gather213, %vec.phi206
%41 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @_j_const1, i64 0, <4 x i64> %32, i64 4
%wide.masked.gather214 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %41, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%42 = add <4 x i32> %wide.masked.gather214, %vec.phi207
%43 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @_j_const1, i64 0, <4 x i64> %32, i64 5
%wide.masked.gather215 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %43, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%44 = add <4 x i32> %wide.masked.gather215, %vec.phi208
%45 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @_j_const1, i64 0, <4 x i64> %32, i64 6
%wide.masked.gather216 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %45, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%46 = add <4 x i32> %wide.masked.gather216, %vec.phi209
%47 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @_j_const1, i64 0, <4 x i64> %32, i64 7
%wide.masked.gather217 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %47, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
%48 = add <4 x i32> %wide.masked.gather217, %vec.phi210
%index.next = add nuw i64 %index, 4
%49 = icmp eq i64 %index.next, %n.vec
br i1 %49, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; └└└└└└└└└
; @ REPL[2]:4 within `f1`
%50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %48)
%51 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %46)
%52 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
%53 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %42)
%54 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %40)
%55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %38)
%56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %36)
%57 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
%58 = insertelement <8 x i32> poison, i32 %57, i64 0
%59 = insertelement <8 x i32> %58, i32 %56, i64 1
%60 = insertelement <8 x i32> %59, i32 %55, i64 2
%61 = insertelement <8 x i32> %60, i32 %54, i64 3
%62 = insertelement <8 x i32> %61, i32 %53, i64 4
%63 = insertelement <8 x i32> %62, i32 %52, i64 5
%64 = insertelement <8 x i32> %63, i32 %51, i64 6
%65 = insertelement <8 x i32> %64, i32 %50, i64 7
br label %scalar.ph
scalar.ph: ; preds = %middle.block, %guard_exit29
%bc.resume.val = phi i64* [ %ind.end, %middle.block ], [ %arrayptr106, %guard_exit29 ]
%bc.resume.val203 = phi i64 [ %ind.end202, %middle.block ], [ 2, %guard_exit29 ]
%66 = phi <8 x i32> [ %65, %middle.block ], [ %14, %guard_exit29 ]
br label %L19
guard_exit34: ; preds = %L19
; ┌ @ array.jl:945 within `iterate`
; │┌ @ int.jl:1068 within `-` @ int.jl:86
%67 = add nsw i64 %value_phi4, -1
; │└
; │┌ @ essentials.jl:13 within `getindex`
%68 = getelementptr inbounds i64, i64* %arrayptr106, i64 %67
; │└
; │┌ @ int.jl:87 within `+`
%69 = add nuw i64 %value_phi4, 1
; └└
br label %L19
guard_pass43: ; preds = %top
%70 = bitcast [8 x i32]* %1 to <8 x i32>*
%71 = load <8 x i32>, <8 x i32>* %70, align 4
br label %L72
} |
Since this is a very CPU-dependent issue (I tried a few different processors, both x86_64 and aarch64, and couldn't reproduce the slowdown), maybe this is related to the LLVM backend? You might also want to try Julia nightly, which now has LLVM 17. |
@giordano same results for nightly. A bunch of extra vector shuffles otherwise IR is the same. |
I'm observing mysterious slowdowns when using different integer types. As a MWE, consider the following two functionally equivalent functions,
f1
and
f2
.
On Julia 1.10.2 I get similar benchmarks for
f1
and
f2
with
Int16
, but a 20x slowdown for
f1
with
Int32
.
With Julia 1.11.0-beta1 I get slowdowns for both
f1
and
f2
when using
Int32
.
EDIT: This seems to depend on the processor. On the machine indicated below I get
The text was updated successfully, but these errors were encountered: