You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mi...@apache.org on 2023/08/07 22:04:15 UTC

[impala] branch master updated: IMPALA-12314: Pre-compile LLVM bytecode with Os

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 8329e6f2e IMPALA-12314: Pre-compile LLVM bytecode with Os
8329e6f2e is described below

commit 8329e6f2e39463d01b549ae84518d131b848adff
Author: Michael Smith <mi...@cloudera.com>
AuthorDate: Fri Jul 21 15:51:09 2023 -0700

    IMPALA-12314: Pre-compile LLVM bytecode with Os
    
    Functions used in codegen fragments are compiled into the binary and
    also compiled into LLVM bytecode that's embedded in the binary. The LLVM
    bytecode is first optimized with clang at O1; however the evaluation of
    which optimization level to use was performed with LLVM 3.3, and we're
    now on LLVM 5. Re-testing with our current performance suite shows Os
    performs the best of available optimization options (O1, O2, O3, Os).
    
    With codegen cache disabled we see across-the-board improvement:
    +----------+-----------------------+---------+------------+------------+----------------+
    | Workload | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
    +----------+-----------------------+---------+------------+------------+----------------+
    | TPCH(42) | parquet / none / none | 3.28    | -2.33%     | 2.44       | -3.42%         |
    +----------+-----------------------+---------+------------+------------+----------------+
    
    +----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------+----------------+---------+--------+
    | Workload | Query    | File Format           | Avg(s) | Base Avg(s) | Delta(Avg) | StdDev(%) | Base StdDev(%) | Iters | Median Diff(%) | MW Zval | Tval   |
    +----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------+----------------+---------+--------+
    | TPCH(42) | TPCH-Q4  | parquet / none / none | 1.78   | 1.80        |   -1.05%   |   2.05%   |   1.62%        | 50    |   -0.32%       | -3.98   | -2.86  |
    | TPCH(42) | TPCH-Q3  | parquet / none / none | 6.51   | 6.57        |   -1.00%   |   0.72%   |   0.54%        | 50    |   -0.83%       | -5.87   | -7.88  |
    | TPCH(42) | TPCH-Q21 | parquet / none / none | 13.43  | 13.57       |   -1.07%   |   0.39%   |   0.47%        | 50    |   -1.12%       | -7.90   | -12.48 |
    | TPCH(42) | TPCH-Q6  | parquet / none / none | 0.79   | 0.81        |   -2.34%   |   3.13%   |   0.93%        | 50    |   -0.31%       | -2.32   | -5.19  |
    | TPCH(42) | TPCH-Q9  | parquet / none / none | 8.65   | 8.78        |   -1.43%   |   0.50%   |   0.55%        | 50    |   -1.45%       | -7.88   | -13.67 |
    | TPCH(42) | TPCH-Q15 | parquet / none / none | 2.57   | 2.60        |   -1.30%   |   1.13%   |   1.02%        | 50    |   -1.75%       | -4.50   | -6.06  |
    | TPCH(42) | TPCH-Q5  | parquet / none / none | 2.25   | 2.30        |   -2.27%   |   1.26%   |   1.21%        | 50    |   -2.28%       | -7.24   | -9.32  |
    | TPCH(42) | TPCH-Q12 | parquet / none / none | 1.62   | 1.65        |   -1.94%   |   1.67%   |   1.45%        | 50    |   -2.69%       | -5.71   | -6.27  |
    | TPCH(42) | TPCH-Q18 | parquet / none / none | 4.90   | 5.02        |   -2.39%   |   1.54%   |   1.02%        | 50    |   -2.41%       | -7.30   | -9.31  |
    | TPCH(42) | TPCH-Q13 | parquet / none / none | 5.71   | 5.85        |   -2.33%   |   1.87%   |   1.70%        | 50    |   -2.59%       | -5.68   | -6.61  |
    | TPCH(42) | TPCH-Q14 | parquet / none / none | 1.66   | 1.70        |   -2.17%   |   2.01%   |   1.75%        | 50    |   -2.87%       | -4.76   | -5.84  |
    | TPCH(42) | TPCH-Q7  | parquet / none / none | 2.69   | 2.76        |   -2.62%   |   1.43%   |   1.24%        | 50    |   -2.77%       | -7.07   | -9.95  |
    | TPCH(42) | TPCH-Q19 | parquet / none / none | 1.98   | 2.04        |   -3.21%   |   1.31%   |   1.73%        | 50    |   -2.62%       | -7.19   | -10.58 |
    | TPCH(42) | TPCH-Q17 | parquet / none / none | 1.86   | 1.92        |   -3.15%   |   1.62%   |   1.75%        | 50    |   -2.92%       | -7.14   | -9.47  |
    | TPCH(42) | TPCH-Q8  | parquet / none / none | 3.61   | 3.73        |   -3.20%   |   0.98%   |   1.05%        | 50    |   -3.09%       | -8.26   | -15.96 |
    | TPCH(42) | TPCH-Q1  | parquet / none / none | 2.98   | 3.08        |   -3.16%   |   1.23%   |   1.30%        | 50    |   -3.33%       | -7.64   | -12.64 |
    | TPCH(42) | TPCH-Q22 | parquet / none / none | 1.55   | 1.60        |   -3.45%   |   2.25%   |   1.89%        | 50    |   -3.28%       | -5.87   | -8.50  |
    | TPCH(42) | TPCH-Q10 | parquet / none / none | 2.81   | 2.90        |   -3.31%   |   1.20%   |   2.18%        | 50    |   -3.47%       | -7.67   | -9.48  |
    | TPCH(42) | TPCH-Q20 | parquet / none / none | 1.88   | 1.97        |   -4.42%   |   1.20%   |   1.42%        | 50    |   -5.06%       | -8.52   | -17.12 |
    | TPCH(42) | TPCH-Q16 | parquet / none / none | 1.04   | 1.10        | I -5.43%   |   1.28%   |   1.86%        | 50    | I -4.91%       | -8.28   | -17.29 |
    | TPCH(42) | TPCH-Q2  | parquet / none / none | 1.05   | 1.13        | I -6.88%   |   2.26%   |   2.12%        | 50    | I -8.96%       | -7.93   | -16.30 |
    | TPCH(42) | TPCH-Q11 | parquet / none / none | 0.74   | 0.88        | I -15.95%  |   3.36%   |   3.73%        | 50    | I -20.48%      | -8.50   | -24.09 |
    +----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------+----------------+---------+--------+
    
    (I) Improvement: TPCH(42) TPCH-Q16 [parquet / none / none] (1.10s -> 1.04s [-5.43%])
    +---------------------+------------+----------+----------+------------+------------+----------+----------+------------+--------+-------+--------+-----------+
    | Operator            | % of Query | Avg      | Base Avg | Delta(Avg) | StdDev(%)  | Max      | Base Max | Delta(Max) | #Hosts | #Inst | #Rows  | Est #Rows |
    +---------------------+------------+----------+----------+------------+------------+----------+----------+------------+--------+-------+--------+-----------+
    | 11:AGGREGATE        | 9.62%      | 116.74ms | 114.24ms | +2.19%     |   4.85%    | 236.00ms | 228.00ms | +3.51%     | 3      | 15    | 4.98M  | 1.28M     |
    | F00:EXCHANGE SENDER | 12.71%     | 154.30ms | 150.21ms | +2.72%     |   5.93%    | 232.00ms | 216.00ms | +7.41%     | 3      | 15    | -1     | -1        |
    | 05:AGGREGATE        | 6.62%      | 80.35ms  | 79.17ms  | +1.50%     |   5.18%    | 164.00ms | 152.00ms | +7.89%     | 3      | 15    | 4.99M  | 1.28M     |
    | 02:SCAN HDFS        | 20.99%     | 254.69ms | 253.55ms | +0.45%     |   9.62%    | 316.00ms | 308.00ms | +2.60%     | 1      | 1     | 207    | 42.00K    |
    | 03:HASH JOIN        | 6.40%      | 77.68ms  | 83.29ms  | -6.74%     |   5.80%    | 156.00ms | 164.00ms | -4.88%     | 3      | 15    | 4.99M  | 1.28M     |
    | F07:JOIN BUILD      | 14.37%     | 174.42ms | 183.92ms | -5.16%     |   6.35%    | 228.00ms | 240.00ms | -5.00%     | 3      | 3     | -1     | -1        |
    | F01:EXCHANGE SENDER | 4.90%      | 59.46ms  | 60.27ms  | -1.34%     | * 12.04% * | 124.00ms | 136.00ms | -8.82%     | 3      | 8     | -1     | -1        |
    | 01:SCAN HDFS        | 7.37%      | 89.43ms  | 89.69ms  | -0.30%     |   6.22%    | 140.00ms | 160.00ms | -12.50%    | 3      | 8     | 1.25M  | 331.46K   |
    | 00:SCAN HDFS        | 5.67%      | 68.84ms  | 68.41ms  | +0.64%     |   6.78%    | 132.00ms | 128.00ms | +3.12%     | 3      | 15    | 33.60M | 33.60M    |
    +---------------------+------------+----------+----------+------------+------------+----------+----------+------------+--------+-------+--------+-----------+
    
    (I) Improvement: TPCH(42) TPCH-Q2 [parquet / none / none] (1.13s -> 1.05s [-6.88%])
    +--------------+------------+----------+----------+------------+------------+----------+----------+------------+--------+-------+---------+-----------+
    | Operator     | % of Query | Avg      | Base Avg | Delta(Avg) | StdDev(%)  | Max      | Base Max | Delta(Max) | #Hosts | #Inst | #Rows   | Est #Rows |
    +--------------+------------+----------+----------+------------+------------+----------+----------+------------+--------+-------+---------+-----------+
    | 01:SCAN HDFS | 4.61%      | 76.65ms  | 73.71ms  | +3.99%     | * 18.60% * | 120.00ms | 108.00ms | +11.11%    | 1      | 1     | 84.23K  | 420.00K   |
    | 00:SCAN HDFS | 2.51%      | 41.82ms  | 44.19ms  | -5.38%     | * 15.65% * | 108.00ms | 112.00ms | -3.57%     | 3      | 8     | 33.42K  | 53.13K    |
    | 02:SCAN HDFS | 19.49%     | 324.36ms | 314.51ms | +3.13%     |   8.44%    | 484.00ms | 496.00ms | -2.42%     | 3      | 15    | 2.36M   | 33.60M    |
    | 07:SCAN HDFS | 16.68%     | 277.63ms | 295.43ms | -6.02%     | * 12.45% * | 328.00ms | 348.00ms | -5.75%     | 1      | 1     | 5       | 25        |
    | 06:SCAN HDFS | 18.46%     | 307.18ms | 326.86ms | -6.02%     | * 11.80% * | 360.00ms | 400.00ms | -10.00%    | 1      | 1     | 84.23K  | 420.00K   |
    | 05:SCAN HDFS | 28.94%     | 481.61ms | 470.92ms | +2.27%     |   5.51%    | 640.00ms | 616.00ms | +3.90%     | 3      | 15    | 493.60K | 33.60M    |
    +--------------+------------+----------+----------+------------+------------+----------+----------+------------+--------+-------+---------+-----------+
    
    (I) Improvement: TPCH(42) TPCH-Q11 [parquet / none / none] (0.88s -> 0.74s [-15.95%])
    +--------------+------------+----------+----------+------------+------------+----------+----------+------------+--------+-------+--------+-----------+
    | Operator     | % of Query | Avg      | Base Avg | Delta(Avg) | StdDev(%)  | Max      | Base Max | Delta(Max) | #Hosts | #Inst | #Rows  | Est #Rows |
    +--------------+------------+----------+----------+------------+------------+----------+----------+------------+--------+-------+--------+-----------+
    | 07:SCAN HDFS | 9.77%      | 61.88ms  | 57.88ms  | +6.91%     | * 35.58% * | 104.00ms | 104.00ms | -0.00%     | 1      | 1     | 16.88K | 420.00K   |
    | 06:SCAN HDFS | 32.20%     | 203.97ms | 197.21ms | +3.43%     |   9.44%    | 344.00ms | 352.00ms | -2.27%     | 3      | 15    | 1.35M  | 33.60M    |
    | 01:SCAN HDFS | 8.65%      | 54.78ms  | 47.35ms  | +15.69%    | * 57.37% * | 104.00ms | 116.00ms | -10.35%    | 1      | 1     | 16.88K | 420.00K   |
    | 00:SCAN HDFS | 34.70%     | 219.78ms | 221.34ms | -0.71%     |   8.53%    | 356.00ms | 364.00ms | -2.20%     | 3      | 15    | 1.35M  | 33.60M    |
    +--------------+------------+----------+----------+------------+------------+----------+----------+------------+--------+-------+--------+-----------+
    
    With codegen cache enabled - highlighting generated code performance -
    we still see improvement, although the results are less conclusive:
    +----------+-----------------------+---------+------------+------------+----------------+
    | Workload | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
    +----------+-----------------------+---------+------------+------------+----------------+
    | TPCH(42) | parquet / none / none | 3.17    | -1.77%     | 2.25       | -1.41%         |
    +----------+-----------------------+---------+------------+------------+----------------+
    
    +----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------+----------------+---------+--------+
    | Workload | Query    | File Format           | Avg(s) | Base Avg(s) | Delta(Avg) | StdDev(%) | Base StdDev(%) | Iters | Median Diff(%) | MW Zval | Tval   |
    +----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------+----------------+---------+--------+
    | TPCH(42) | TPCH-Q1  | parquet / none / none | 2.80   | 2.74        |   +1.84%   |   1.65%   |   2.25%        | 50    |   +1.93%       | 4.76    | 4.64   |
    | TPCH(42) | TPCH-Q15 | parquet / none / none | 2.47   | 2.45        |   +0.81%   |   1.56%   |   1.47%        | 50    |   +0.34%       | 2.99    | 2.67   |
    | TPCH(42) | TPCH-Q14 | parquet / none / none | 1.66   | 1.64        |   +0.75%   |   1.94%   |   2.66%        | 50    |   +0.23%       | 3.58    | 1.60   |
    | TPCH(42) | TPCH-Q12 | parquet / none / none | 1.60   | 1.60        |   +0.12%   |   2.86%   |   2.55%        | 50    |   +0.19%       | 1.44    | 0.21   |
    | TPCH(42) | TPCH-Q13 | parquet / none / none | 6.63   | 6.62        |   +0.15%   |   7.42%   |   8.26%        | 50    |   +0.08%       | 0.26    | 0.10   |
    | TPCH(42) | TPCH-Q10 | parquet / none / none | 2.79   | 2.79        |   -0.31%   |   4.14%   |   2.50%        | 50    |   -0.11%       | -0.55   | -0.45  |
    | TPCH(42) | TPCH-Q16 | parquet / none / none | 0.89   | 0.89        |   -0.52%   |   2.28%   |   2.54%        | 50    |   +0.06%       | 0.76    | -1.08  |
    | TPCH(42) | TPCH-Q17 | parquet / none / none | 1.81   | 1.82        |   -0.65%   |   2.17%   |   1.88%        | 50    |   -0.08%       | -1.10   | -1.60  |
    | TPCH(42) | TPCH-Q11 | parquet / none / none | 0.52   | 0.52        |   -0.97%   |   2.13%   |   2.25%        | 50    |   +0.12%       | 1.19    | -2.23  |
    | TPCH(42) | TPCH-Q20 | parquet / none / none | 1.73   | 1.75        |   -1.22%   |   1.67%   |   1.76%        | 50    |   -0.27%       | -3.08   | -3.59  |
    | TPCH(42) | TPCH-Q6  | parquet / none / none | 0.79   | 0.80        |   -1.63%   |   3.14%   |   2.58%        | 50    |   -0.13%       | -2.88   | -2.86  |
    | TPCH(42) | TPCH-Q19 | parquet / none / none | 1.27   | 1.29        |   -1.51%   |   2.02%   |   1.95%        | 50    |   -0.36%       | -3.91   | -3.82  |
    | TPCH(42) | TPCH-Q21 | parquet / none / none | 13.20  | 13.35       |   -1.10%   |   0.48%   |   0.53%        | 50    |   -1.13%       | -7.27   | -10.86 |
    | TPCH(42) | TPCH-Q3  | parquet / none / none | 6.45   | 6.57        |   -1.75%   |   1.21%   |   1.48%        | 50    |   -1.59%       | -5.23   | -6.53  |
    | TPCH(42) | TPCH-Q7  | parquet / none / none | 2.48   | 2.53        |   -2.06%   |   2.16%   |   2.50%        | 50    |   -2.10%       | -3.99   | -4.44  |
    | TPCH(42) | TPCH-Q18 | parquet / none / none | 4.70   | 4.81        |   -2.24%   |   2.39%   |   2.32%        | 50    |   -2.18%       | -4.86   | -4.82  |
    | TPCH(42) | TPCH-Q5  | parquet / none / none | 2.16   | 2.21        |   -2.12%   |   2.30%   |   2.14%        | 50    |   -2.38%       | -4.65   | -4.84  |
    | TPCH(42) | TPCH-Q4  | parquet / none / none | 1.72   | 1.75        |   -2.09%   |   2.34%   |   2.18%        | 50    |   -2.88%       | -4.52   | -4.66  |
    | TPCH(42) | TPCH-Q8  | parquet / none / none | 3.50   | 3.59        |   -2.63%   |   2.34%   |   2.79%        | 50    |   -2.86%       | -4.52   | -5.16  |
    | TPCH(42) | TPCH-Q22 | parquet / none / none | 1.50   | 1.54        |   -2.53%   |   4.70%   |   3.77%        | 50    |   -3.34%       | -2.99   | -3.02  |
    | TPCH(42) | TPCH-Q2  | parquet / none / none | 0.75   | 0.79        |   -4.37%   |   2.93%   |   1.77%        | 50    |   -6.02%       | -5.93   | -9.32  |
    | TPCH(42) | TPCH-Q9  | parquet / none / none | 8.28   | 8.87        | I -6.66%   |   0.89%   |   1.25%        | 50    | I -7.27%       | -8.53   | -31.40 |
    +----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------+----------------+---------+--------+
    
    (I) Improvement: TPCH(42) TPCH-Q9 [parquet / none / none] (8.87s -> 8.28s [-6.66%])
    +---------------------+------------+----------+----------+------------+-----------+----------+----------+------------+--------+-------+---------+-----------+
    | Operator            | % of Query | Avg      | Base Avg | Delta(Avg) | StdDev(%) | Max      | Base Max | Delta(Max) | #Hosts | #Inst | #Rows   | Est #Rows |
    +---------------------+------------+----------+----------+------------+-----------+----------+----------+------------+--------+-------+---------+-----------+
    | F11:JOIN BUILD      | 7.52%      | 1.38s    | 1.34s    | +2.94%     |   3.85%   | 2.15s    | 1.83s    | +17.25%    | 3      | 15    | -1      | -1        |
    | F05:EXCHANGE SENDER | 7.78%      | 1.43s    | 1.32s    | +8.21%     |   3.54%   | 2.17s    | 2.10s    | +3.63%     | 3      | 15    | -1      | -1        |
    | F04:EXCHANGE SENDER | 5.82%      | 1.07s    | 1.13s    | -5.57%     |   3.76%   | 1.54s    | 1.68s    | -8.57%     | 3      | 15    | -1      | -1        |
    | F12:JOIN BUILD      | 5.33%      | 978.25ms | 886.69ms | +10.33%    |   5.86%   | 1.48s    | 1.32s    | +12.43%    | 3      | 15    | -1      | -1        |
    | F03:EXCHANGE SENDER | 3.40%      | 623.44ms | 606.68ms | +2.76%     |   4.22%   | 1.05s    | 991.98ms | +5.65%     | 3      | 15    | -1      | -1        |
    | F00:EXCHANGE SENDER | 5.67%      | 1.04s    | 1.10s    | -5.39%     |   2.28%   | 1.51s    | 1.60s    | -5.51%     | 3      | 15    | -1      | -1        |
    | 01:SCAN HDFS        | 10.25%     | 1.88s    | 2.02s    | -7.03%     |   4.28%   | 2.05s    | 2.16s    | -5.17%     | 1      | 1     | 420.00K | 420.00K   |
    | 06:HASH JOIN        | 2.58%      | 473.29ms | 517.49ms | -8.54%     |   3.72%   | 876.00ms | 811.98ms | +7.88%     | 3      | 15    | 13.72M  | 24.34M    |
    | 00:SCAN HDFS        | 12.79%     | 2.35s    | 2.43s    | -3.48%     |   2.35%   | 2.62s    | 2.72s    | -3.82%     | 3      | 8     | 457.14K | 840.00K   |
    | 02:SCAN HDFS        | 28.98%     | 5.32s    | 5.65s    | -5.78%     |   1.22%   | 6.21s    | 6.52s    | -4.66%     | 3      | 15    | 68.38M  | 252.01M   |
    +---------------------+------------+----------+----------+------------+-----------+----------+----------+------------+--------+-------+---------+-----------+
    
    More evaluation of this option would be useful, so this change adds a
    new startup option 'llvm_ir_opt' that selects among several
    pre-optimized versions of the LLVM bytecode that provide library
    functions for our generated code. Available options are O1, O2, and Os,
    defaulting to Os. Release binary size increases by ~6.5MB.
    
    Leaves impala_legacy_avx_llvm_ir untouched.
    
    Change-Id: I6dd1a07ce63dbc2c27b00f450e11eceaa7bb0822
    Reviewed-on: http://gerrit.cloudera.org:8080/20265
    Reviewed-by: Daniel Becker <da...@cloudera.com>
    Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/CMakeLists.txt               |  8 ++--
 be/src/codegen/CMakeLists.txt   | 96 ++++++++++++++++-------------------------
 be/src/codegen/impala-ir-data.h | 10 ++++-
 be/src/codegen/llvm-codegen.cc  | 16 ++++++-
 4 files changed, 63 insertions(+), 67 deletions(-)

diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 572d944d3..2b367bbe8 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -288,9 +288,7 @@ add_definitions(-DKUDU_HEADERS_USE_RICH_SLICE -DKUDU_HEADERS_NO_STUBS)
 
 # Set clang flags for cross-compiling to IR.
 # IR_COMPILE is #defined for the cross compile to remove code that bloats the IR.
-# We enable basic optimizations (-O1) to reduce the IR size and speed up runtime JIT.
-# Empirically, the runtime JIT produces slightly better code when starting with IR that
-# was optimized at -O1. Higher optimization levels tend to bloat the code.
+# Optimization is omitted and left up to individual uses.
 #  -Wno-deprecated: gutil contains deprecated headers
 #  -Wno-return-type-c-linkage: UDFs return C++ classes but use C linkage to prevent
 #       mangling
@@ -299,7 +297,7 @@ set(CLANG_IR_CXX_FLAGS "-emit-llvm" "-c" "-std=c++17" "-DIR_COMPILE" "-DHAVE_INT
   "-DHAVE_NETINET_IN_H" "-DBOOST_DATE_TIME_POSIX_TIME_STD_CONFIG" "-DBOOST_NO_EXCEPTIONS"
   "-DBOOST_BIND_GLOBAL_PLACEHOLDERS" "-DBOOST_ALLOW_DEPRECATED_HEADERS"
   "-DKUDU_HEADERS_NO_STUBS" "-fcolor-diagnostics" "-Wno-deprecated"
-  "-Wno-return-type-c-linkage" "-fsigned-char" "-O1")
+  "-Wno-return-type-c-linkage" "-fsigned-char")
 
 if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
   set(CLANG_IR_CXX_FLAGS "${CLANG_IR_CXX_FLAGS}" "-march=armv8-a+crc"
@@ -808,7 +806,7 @@ function(COMPILE_TO_IR SRC_FILE)
   set(OUTPUT_FILE "${LIBRARY_OUTPUT_PATH}/${BASE_NAME}.ll")
   add_custom_command(
     OUTPUT ${OUTPUT_FILE}
-    COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} ${HIDE_SYMBOLS_ARGS}
+    COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} -O2 ${HIDE_SYMBOLS_ARGS}
             ${CLANG_INCLUDE_FLAGS} ${SRC_FILE} -o ${OUTPUT_FILE}
     DEPENDS ${SRC_FILE})
   add_custom_target(${BASE_NAME}-ir ALL DEPENDS ${OUTPUT_FILE})
diff --git a/be/src/codegen/CMakeLists.txt b/be/src/codegen/CMakeLists.txt
index 870e1f3f5..4c5f497bd 100644
--- a/be/src/codegen/CMakeLists.txt
+++ b/be/src/codegen/CMakeLists.txt
@@ -23,7 +23,9 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/codegen")
 set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/codegen")
 
 # Generated C files for IR
-set(IR_C_FILE $ENV{IMPALA_HOME}/be/generated-sources/impala-ir/impala-ir.cc)
+set(IR_O1_C_FILE $ENV{IMPALA_HOME}/be/generated-sources/impala-ir/impala-ir-o1.cc)
+set(IR_O2_C_FILE $ENV{IMPALA_HOME}/be/generated-sources/impala-ir/impala-ir-o2.cc)
+set(IR_Os_C_FILE $ENV{IMPALA_HOME}/be/generated-sources/impala-ir/impala-ir-os.cc)
 set(LEGACY_AVX_IR_C_FILE $ENV{IMPALA_HOME}/be/generated-sources/impala-ir/impala-ir-legacy-avx.cc)
 
 add_library(CodeGen
@@ -35,7 +37,9 @@ add_library(CodeGen
   llvm-codegen.cc
   llvm-codegen-cache.cc
   instruction-counter.cc
-  ${IR_C_FILE}
+  ${IR_O1_C_FILE}
+  ${IR_O2_C_FILE}
+  ${IR_Os_C_FILE}
   ${LEGACY_AVX_IR_C_FILE}
 )
 add_dependencies(CodeGen gen-deps)
@@ -61,71 +65,47 @@ add_custom_target(gen_ir_descriptions ALL DEPENDS ${IR_DESC_GEN_OUTPUT})
 
 set(IR_INPUT_FILES impala-ir.cc)
 
-# Set of files for generating the regular IR
-set(IR_TMP_OUTPUT_FILE "${LLVM_IR_OUTPUT_DIRECTORY}/impala-tmp.bc")
-set(IR_OUTPUT_FILE "${LLVM_IR_OUTPUT_DIRECTORY}/impala.bc")
-set(IR_TMP_C_FILE ${IR_C_FILE}.tmp)
-
-# Set of files for generating the legacy AVX IR
-# This is generated on all platforms, but it is only used for x86_64
-set(LEGACY_AVX_IR_TMP_OUTPUT_FILE "${LLVM_IR_OUTPUT_DIRECTORY}/impala-tmp-legacy-avx.bc")
-set(LEGACY_AVX_IR_OUTPUT_FILE "${LLVM_IR_OUTPUT_DIRECTORY}/impala-legacy-avx.bc")
-set(LEGACY_AVX_IR_TMP_C_FILE ${LEGACY_AVX_IR_C_FILE}.tmp)
-
-# Run the clang compiler to generate IR. Then run the LLVM opt tool to apply specific
-# optimisations. We need to compile to IR twice, once with regular options and one
-# with legacy AVX support.
-# At runtime impala will pick the file to load based on a startup parameter.
+function(COMPILE_TO_IR_C_ARRAY IR_C_FILE VARNAME OPT PLATFORM_SPECIFIC)
+  # Run the clang compiler to generate IR. Then run the LLVM opt tool to apply specific
+  # optimisations. We need to compile to IR several times for different optimization settings
+  # and legacy AVX support.
+  get_filename_component(BASE_NAME ${IR_C_FILE} NAME_WE)
+  set(IR_OUTPUT_FILE "${LLVM_IR_OUTPUT_DIRECTORY}/${BASE_NAME}.bc")
+  set(IR_TMP_OUTPUT_FILE "${LLVM_IR_OUTPUT_DIRECTORY}/${BASE_NAME}-tmp.bc")
+  add_custom_command(
+    OUTPUT ${IR_OUTPUT_FILE}
+    COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} ${OPT} ${PLATFORM_SPECIFIC}
+            ${CLANG_INCLUDE_FLAGS} ${IR_INPUT_FILES} -o ${IR_TMP_OUTPUT_FILE}
+    COMMAND ${LLVM_OPT_EXECUTABLE} ${LLVM_OPT_IR_FLAGS} < ${IR_TMP_OUTPUT_FILE} > ${IR_OUTPUT_FILE}
+    COMMAND rm ${IR_TMP_OUTPUT_FILE}
+    DEPENDS ExecIr ExecAvroIr ExecKuduIr ExprsIr RuntimeIr UdfIr UtilIr ${IR_INPUT_FILES}
+  )
+
+  # Convert LLVM bytecode to C array.
+  set(IR_TMP_C_FILE ${IR_C_FILE}.tmp)
+  add_custom_command(
+    OUTPUT ${IR_C_FILE}
+    COMMAND $ENV{IMPALA_HOME}/bin/file2array.sh -n -v ${VARNAME} ${IR_OUTPUT_FILE} > ${IR_TMP_C_FILE}
+    COMMAND mv ${IR_TMP_C_FILE} ${IR_C_FILE}
+    DEPENDS $ENV{IMPALA_HOME}/bin/file2array.sh
+    DEPENDS ${IR_OUTPUT_FILE}
+  )
+endfunction(COMPILE_TO_IR_C_ARRAY)
 
 if (NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
   set(PLATFORM_SPECIFIC_FLAGS "-mavx2")
   set(LEGACY_AVX_SPECIFIC_FLAGS "-mavx")
 endif()
-add_custom_command(
-  OUTPUT ${IR_OUTPUT_FILE}
-  COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} ${PLATFORM_SPECIFIC_FLAGS}
-          ${CLANG_INCLUDE_FLAGS} ${IR_INPUT_FILES} -o ${IR_TMP_OUTPUT_FILE}
-  COMMAND ${LLVM_OPT_EXECUTABLE} ${LLVM_OPT_IR_FLAGS} < ${IR_TMP_OUTPUT_FILE} > ${IR_OUTPUT_FILE}
-  COMMAND rm ${IR_TMP_OUTPUT_FILE}
-  DEPENDS ExecIr ExecAvroIr ExecKuduIr ExprsIr RuntimeIr UdfIr UtilIr ${IR_INPUT_FILES}
-)
-
-add_custom_command(
-  OUTPUT ${LEGACY_AVX_IR_OUTPUT_FILE}
-  COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} ${LEGACY_AVX_SPECIFIC_FLAGS}
-          ${CLANG_INCLUDE_FLAGS} ${IR_INPUT_FILES} -o ${LEGACY_AVX_IR_TMP_OUTPUT_FILE}
-  COMMAND ${LLVM_OPT_EXECUTABLE} ${LLVM_OPT_IR_FLAGS} < ${LEGACY_AVX_IR_TMP_OUTPUT_FILE}
-          > ${LEGACY_AVX_IR_OUTPUT_FILE}
-  COMMAND rm ${LEGACY_AVX_IR_TMP_OUTPUT_FILE}
-  DEPENDS ExecIr ExecAvroIr ExecKuduIr ExprsIr RuntimeIr UdfIr UtilIr ${IR_INPUT_FILES}
-)
-
-add_custom_target(compile_to_ir DEPENDS ${IR_OUTPUT_FILE})
-add_custom_target(compile_to_ir_legacy_avx DEPENDS ${LEGACY_AVX_IR_OUTPUT_FILE})
-
-# Convert LLVM bytecode to C array.
-add_custom_command(
-  OUTPUT ${IR_C_FILE}
-  COMMAND $ENV{IMPALA_HOME}/bin/file2array.sh -n -v impala_llvm_ir ${IR_OUTPUT_FILE} > ${IR_TMP_C_FILE}
-  COMMAND mv ${IR_TMP_C_FILE} ${IR_C_FILE}
-  DEPENDS $ENV{IMPALA_HOME}/bin/file2array.sh
-  DEPENDS ${IR_OUTPUT_FILE}
-)
-
-# Convert LLVM bytecode to C array.
-add_custom_command(
-  OUTPUT ${LEGACY_AVX_IR_C_FILE}
-  COMMAND $ENV{IMPALA_HOME}/bin/file2array.sh -n -v impala_legacy_avx_llvm_ir
-          ${LEGACY_AVX_IR_OUTPUT_FILE} > ${LEGACY_AVX_IR_TMP_C_FILE}
-  COMMAND mv ${LEGACY_AVX_IR_TMP_C_FILE} ${LEGACY_AVX_IR_C_FILE}
-  DEPENDS $ENV{IMPALA_HOME}/bin/file2array.sh
-  DEPENDS ${LEGACY_AVX_IR_OUTPUT_FILE}
-)
+# At runtime impala will pick the file to load based on a startup parameter.
+COMPILE_TO_IR_C_ARRAY(${IR_O1_C_FILE} impala_llvm_o1_ir -O1 ${PLATFORM_SPECIFIC_FLAGS})
+COMPILE_TO_IR_C_ARRAY(${IR_O2_C_FILE} impala_llvm_o2_ir -O2 ${PLATFORM_SPECIFIC_FLAGS})
+COMPILE_TO_IR_C_ARRAY(${IR_Os_C_FILE} impala_llvm_os_ir -Os ${PLATFORM_SPECIFIC_FLAGS})
+COMPILE_TO_IR_C_ARRAY(${LEGACY_AVX_IR_C_FILE} impala_legacy_avx_llvm_ir -O1 ${LEGACY_AVX_SPECIFIC_FLAGS})
 
 
 # Run the clang compiler to generate BC for llvm-codegen-test
 add_custom_target(test-loop.bc
-  COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} ${CLANG_INCLUDE_FLAGS} ${CMAKE_SOURCE_DIR}/testdata/llvm/test-loop.cc -o ${CMAKE_SOURCE_DIR}/llvm-ir/test-loop.bc
+  COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} -O2 ${CLANG_INCLUDE_FLAGS} ${CMAKE_SOURCE_DIR}/testdata/llvm/test-loop.cc -o ${CMAKE_SOURCE_DIR}/llvm-ir/test-loop.bc
   SOURCES ${CMAKE_SOURCE_DIR}/testdata/llvm/test-loop.cc
 )
 
diff --git a/be/src/codegen/impala-ir-data.h b/be/src/codegen/impala-ir-data.h
index 758580816..09818244b 100644
--- a/be/src/codegen/impala-ir-data.h
+++ b/be/src/codegen/impala-ir-data.h
@@ -22,8 +22,14 @@
 #ifndef IMPALA_CODEGEN_IR_DATA_H
 #define IMPALA_CODEGEN_IR_DATA_H
 
-extern const unsigned char impala_llvm_ir[];
-extern const size_t impala_llvm_ir_len;
+extern const unsigned char impala_llvm_o1_ir[];
+extern const size_t impala_llvm_o1_ir_len;
+
+extern const unsigned char impala_llvm_o2_ir[];
+extern const size_t impala_llvm_o2_ir_len;
+
+extern const unsigned char impala_llvm_os_ir[];
+extern const size_t impala_llvm_os_ir_len;
 
 extern const unsigned char impala_legacy_avx_llvm_ir[];
 extern const size_t impala_legacy_avx_llvm_ir_len;
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index b27033aba..5e2f8be24 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -125,6 +125,8 @@ DEFINE_string_hidden(llvm_cpu_attr_whitelist, "adx,aes,avx,avx2,bmi,bmi2,cmov,cx
     "routinely tested. This flag is provided to enable additional LLVM CPU attribute "
     "flags for testing.");
 #endif
+DEFINE_string_hidden(llvm_ir_opt, "Os",
+    "The IR optimization level for pre-generated code; supports O1, O2, and Os.");
 DECLARE_bool(enable_legacy_avx_support);
 
 namespace impala {
@@ -279,8 +281,18 @@ Status LlvmCodeGen::CreateFromMemory(FragmentState* state, ObjectPool* pool,
   // LLVM IR to use.
   if (IsCPUFeatureEnabled(CpuInfo::AVX2)) {
     // Use the default IR that supports AVX2
-    module_ir = llvm::StringRef(
-        reinterpret_cast<const char*>(impala_llvm_ir), impala_llvm_ir_len);
+    if (FLAGS_llvm_ir_opt == "O1") {
+      module_ir = llvm::StringRef(
+          reinterpret_cast<const char*>(impala_llvm_o1_ir), impala_llvm_o1_ir_len);
+    } else if (FLAGS_llvm_ir_opt == "O2") {
+      module_ir = llvm::StringRef(
+          reinterpret_cast<const char*>(impala_llvm_o2_ir), impala_llvm_o2_ir_len);
+    } else if (FLAGS_llvm_ir_opt == "Os") {
+      module_ir = llvm::StringRef(
+          reinterpret_cast<const char*>(impala_llvm_os_ir), impala_llvm_os_ir_len);
+    } else {
+      CHECK(false) << "llvm_ir_opt flag invalid; try O1, O2, or Os.";
+    }
     module_name = "Impala IR with AVX2 support";
   } else if (FLAGS_enable_legacy_avx_support && IsCPUFeatureEnabled(CpuInfo::AVX)) {
     // If there is no AVX but legacy mode is enabled, use legacy IR with AVX support