62 files changed, 949 insertions, 384 deletions
diff --git a/flang/docs/AssumedRank.md b/flang/docs/AssumedRank.md
index c5d2c3e..0154adc 100644
--- a/flang/docs/AssumedRank.md
+++ b/flang/docs/AssumedRank.md
@@ -101,9 +101,9 @@ Assumed-rank dummies are also represented in the
 represent assumed-rank in procedure characteristics.
 
 ### Runtime Representation of Assumed-Ranks
-Assumed-ranks are implemented as CFI_cdesc_t (18.5.3) with the addition of an
-f18 specific addendum when required for the type. This is the usual f18
-descriptor, and no changes is required to represent assumed-ranks in this data
+Assumed-ranks are implemented as CFI_cdesc_t (18.5.3) with the addition of a
+Flang-specific addendum when required for the type. This is the usual Flang
+descriptor, and no changes are required to represent assumed-ranks in this data
 structure. In fact, there is no difference between the runtime descriptor
 created for an assumed shape and the runtime descriptor created when the
 corresponding entity is passed as an assumed-rank.
diff --git a/flang/docs/C++17.md b/flang/docs/C++17.md
index f36110a..9137827 100644
--- a/flang/docs/C++17.md
+++ b/flang/docs/C++17.md
@@ -6,7 +6,7 @@
   
 -->
 
-# C++14/17 features used in f18
+# C++14/17 features used in Flang
 
 ```{contents}
 ---
@@ -27,7 +27,7 @@ out the details of how our C++ code should look and gives
 guidance about feature usage.
 
 We have chosen to use some features of the recent C++17
-language standard in f18.
+language standard in Flang.
 The most important of these are:
 * sum types (discriminated unions) in the form of `std::variant`
 * `using` template parameter packs
@@ -41,7 +41,7 @@ in this list because it's not particularly well known.)
 ## Sum types
 
 First, some background information to explain the need for sum types
-in f18.
+in Flang.
 
 Fortran is notoriously problematic to lex and parse, as tokenization
 depends on the state of the partial parse;
@@ -57,7 +57,7 @@ a unified lexer/parser.
 We have chosen to do so because it is simpler and should reduce
 both initial bugs and long-term maintenance.
 
-Specifically, f18's parser uses the technique of recursive descent with
+Specifically, Flang's parser uses the technique of recursive descent with
 backtracking.
 It is constructed as the incremental composition of pure parsing functions
 that each, when given a context (location in the input stream plus some state),
@@ -73,7 +73,7 @@ of Fortran.
 
 The specification of Fortran uses a form of BNF with alternatives,
 optional elements, sequences, and lists.  Each of these constructs
-in the Fortran grammar maps directly in the f18 parser to both
+in the Fortran grammar maps directly in Flang's parser to both
 the means of combining other parsers as alternatives, &c., and to
 the declarations of the parse tree data structures that represent
 the results of successful parses.
@@ -87,10 +87,10 @@ The bounded polymorphism supplied by the C++17 `std::variant` fits
 those needs exactly.
 For example, production R502 in Fortran defines the top-level
 program unit of Fortran as being a function, subroutine, module, &c.
-The `struct ProgramUnit` in the f18 parse tree header file
+`struct ProgramUnit` in the Flang parse tree header file
 represents each program unit with a member that is a `std::variant`
 over the six possibilities.
-Similarly, the parser for that type in the f18 grammar has six alternatives,
+Similarly, the parser for that type in Flang's grammar has six alternatives,
 each of which constructs an instance of `ProgramUnit` upon the result of
 parsing a `Module`, `FunctionSubprogram`, and so on.
 
@@ -99,7 +99,7 @@ parse is typically implemented with overloaded functions.
 A function instantiated on `ProgramUnit` will use `std::visit` to
 identify the right alternative and perform the right actions.
 The call to `std::visit` must pass a visitor that can handle all
-of the possibilities, and f18 will fail to build if one is missing.
+of the possibilities, and Flang will fail to build if one is missing.
 
 Were we unable to use `std::variant` directly, we would likely
 have chosen to implement a local `SumType` replacement; in the
diff --git a/flang/docs/C++style.md b/flang/docs/C++style.md
index cbb96f1..a4ca962 100644
--- a/flang/docs/C++style.md
+++ b/flang/docs/C++style.md
@@ -30,7 +30,7 @@ is clear on usage, follow it.
   is pretty good and comes with lots of justifications for its rules.
 * Reasonable exceptions to these guidelines can be made.
 * Be aware of some workarounds for known issues in older C++ compilers that should
-  still be able to compile f18. They are listed at the end of this document.
+  still be able to compile Flang. They are listed at the end of this document.
 
 ## In particular:
 
@@ -261,7 +261,7 @@ move semantics, member access, and comparison for equality; suitable for use in
 `std::variant<>`.
 * `std::unique_ptr<>`: A nullable pointer with ownership, null by default,
 not copyable, reassignable.
-F18 has a helpful `Deleter<>` class template that makes `unique_ptr<>`
+Flang has a helpful `Deleter<>` class template that makes `unique_ptr<>`
 easier to use with forward-referenced data types.
 * `std::shared_ptr<>`: A nullable pointer with shared ownership via reference
 counting, null by default, shallowly copyable, reassignable, and slow.
@@ -312,9 +312,9 @@ Consistency is one of many aspects in the pursuit of clarity,
 but not an end in itself.
 
 ## C++ compiler bug workarounds
-Below is a list of workarounds for C++ compiler bugs met with f18 that, even
-if the bugs are fixed in latest C++ compiler versions, need to be applied so
-that all desired tool-chains can compile f18.
+Below is a list of workarounds for C++ compiler bugs encountered when building
+Flang. Even if the bugs are fixed in the latest C++ compiler versions, these need to
+be applied so that all desired tool-chains can compile Flang.
 
 ### Explicitly move noncopyable local variable into optional results
 
@@ -338,7 +338,7 @@ std::optional<CantBeCopied> fooOK() {
 }
 ```
 The underlying bug is actually not specific to `std::optional` but this is the most common
-case in f18 where the issue may occur. The actual bug can be reproduced with any class `B`
+case in Flang where the issue may occur. The actual bug can be reproduced with any class `B`
 that has a perfect forwarding constructor taking `CantBeCopied` as argument:
 `template<typename CantBeCopied> B(CantBeCopied&& x) x_{std::forward<CantBeCopied>(x)} {}`.
 In such scenarios, Ubuntu 18.04 g++ fails to instantiate the move constructor
diff --git a/flang/docs/Calls.md b/flang/docs/Calls.md
index f518dc0..f27af1a 100644
--- a/flang/docs/Calls.md
+++ b/flang/docs/Calls.md
@@ -529,7 +529,7 @@ PGI passes host instance links in descriptors in additional arguments
 that are not always successfully forwarded across implicit interfaces,
 sometimes leading to crashes when they turn out to be needed.
 
-F18 will manage a pool of trampolines in its runtime support library
+Flang will manage a pool of trampolines in its runtime support library
 that can be used to pass internal procedures as effective arguments
 to F77ish procedures, so that
 a bare code address can serve to represent the effective argument.
@@ -569,14 +569,14 @@ Fortran 2018 explicitly enables us to do this with a correction to Fortran
 2003 in 4.3.4(5).
 
 Last, there must be reasonably permanent naming conventions used
-by the F18 runtime library for those unrestricted specific intrinsic
+by Flang's runtime library for those unrestricted specific intrinsic
 functions (table 16.2 in 16.8) and extensions that can be passed as
 arguments.
 
 In these cases where external naming is at the discretion
 of the implementation, we should use names that are not in the C language
 user namespace, begin with something that identifies
-the current incompatible version of F18, the module, the submodule, and
+the current incompatible version of Flang, the module, the submodule, and
 elemental SIMD width, and are followed by the external name.
 The parts of the external name can be separated by some character that
 is acceptable for use in LLVM IR and assembly language but not in user
diff --git a/flang/docs/Character.md b/flang/docs/Character.md
index 4e1d407..96e0a06 100644
--- a/flang/docs/Character.md
+++ b/flang/docs/Character.md
@@ -6,7 +6,7 @@
 
 -->
 
-# Implementation of `CHARACTER` types in f18
+# Implementation of `CHARACTER` types in Flang
 
 ```{contents}
 ---
@@ -16,7 +16,7 @@ local:
 
 ## Kinds and Character Sets
 
-The f18 compiler and runtime support three kinds of the intrinsic
+The Flang compiler and runtime support three kinds of the intrinsic
 `CHARACTER` type of Fortran 2018.
 The default (`CHARACTER(KIND=1)`) holds 8-bit character codes;
 `CHARACTER(KIND=2)` holds 16-bit character codes;
@@ -108,12 +108,12 @@ The result of `//` may be used
 * as the value of a specifier of an I/O statement,
 * or as the value of a statement function.
 
-The f18 compiler has a general (but slow) means of implementing concatenation
+The Flang compiler has a general (but slow) means of implementing concatenation
 and a specialized (fast) option to optimize the most common case.
 
 ### General concatenation
 
-In the most general case, the f18 compiler's generated code and
+In the most general case, Flang's generated code and
 runtime support library represent the result as a deferred-length allocatable
 `CHARACTER` temporary scalar or array variable that is initialized
 as a zero-length array by `AllocatableInitCharacter()`
diff --git a/flang/docs/DoConcurrent.md b/flang/docs/DoConcurrent.md
index bd1008a..eba2656 100644
--- a/flang/docs/DoConcurrent.md
+++ b/flang/docs/DoConcurrent.md
@@ -280,7 +280,8 @@ Specifically, an easy means is required that stipulates that localization
 should apply at most only to the obvious cases of local non-pointer
 non-allocatable scalars.
 
-In the LLVM Fortran compiler project (a/k/a "flang", "f18") we considered
+In the LLVM Fortran compiler project (now known as "flang", previously also
+known as "f18") we considered
 several solutions to this problem.
 1. Add syntax (e.g., `DO PARALLEL` or `DO CONCURRENT() DEFAULT(PARALLEL)`)
    by which one can inform the compiler that it should localize only
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 420b751..6d87209 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -84,7 +84,7 @@ end
   be "local identifiers" and should be distinct in the "inclusive
   scope" -- i.e., not scoped by `BLOCK` constructs.
   As most (but not all) compilers implement `BLOCK` scoping of construct
-  names, so does f18, with a portability warning.
+  names, so does Flang, with a portability warning.
 * 15.6.4 paragraph 2 prohibits an implicitly typed statement function
   from sharing the same name as a symbol in its scope's host, if it
   has one.
@@ -153,7 +153,7 @@ end
   that a call to intrinsic module procedure `ieee_support_halting` with
   a constant argument has a compile time constant result in `constant
   expression` and `specification expression` contexts. In compilations
-  where this information is not known at compile time, f18 generates code
+  where this information is not known at compile time, Flang generates code
   to determine the absence or presence of this capability at runtime.
   A call to `ieee_support_halting` in contexts that the standard requires
   to be constant will generate a compilation error. `ieee_support_standard`
@@ -366,7 +366,7 @@ end
 * The legacy extension intrinsic functions `IZEXT` and `JZEXT`
   are supported; `ZEXT` has different behavior with various older
   compilers, so it is not supported.
-* f18 doesn't impose a limit on the number of continuation lines
+* Flang doesn't impose a limit on the number of continuation lines
   allowed for a single statement.
 * When a type-bound procedure declaration statement has neither interface
   nor attributes, the "::" before the bindings is optional, even
@@ -553,7 +553,7 @@ end
 * Fortran explicitly ignores type declaration statements when they
   attempt to type the name of a generic intrinsic function (8.2 p3).
   One can declare `CHARACTER::COS` and still get a real result
-  from `COS(3.14159)`, for example.  f18 will complain when a
+  from `COS(3.14159)`, for example.  Flang will complain when a
   generic intrinsic function's inferred result type does not
   match an explicit declaration.  This message is a warning.
 
@@ -570,7 +570,7 @@ end
 
 ## Standard features that might as well not be
 
-* f18 supports designators with constant expressions, properly
+* Flang supports designators with constant expressions, properly
   constrained, as initial data targets for data pointers in
   initializers of variable and component declarations and in
   `DATA` statements; e.g., `REAL, POINTER :: P => T(1:10:2)`.
@@ -587,8 +587,8 @@ end
 * The standard doesn't explicitly require that a named constant that
   appears as part of a complex-literal-constant be a scalar, but
   most compilers emit an error when an array appears.
-  f18 supports them with a portability warning.
-* f18 does not enforce a blanket prohibition against generic
+  Flang supports them with a portability warning.
+* Flang does not enforce a blanket prohibition against generic
   interfaces containing a mixture of functions and subroutines.
   We allow both to appear, unlike several other Fortran compilers.
   This is especially desirable when two generics of the same
@@ -655,7 +655,7 @@ end
   treat them as references to implicitly typed local variables, and
   load uninitialized values.
 
-  In f18, we chose to emit an error message for this case since the standard
+  In Flang, we chose to emit an error message for this case since the standard
   is unclear, the usage is not portable, and the issue can be easily resolved
   by adding a declaration.
 
@@ -686,7 +686,7 @@ end
 
 * When a `DATA` statement in a `BLOCK` construct could be construed as
   either initializing a host-associated object or declaring a new local
-  initialized object, f18 interprets the standard's classification of
+  initialized object, Flang interprets the standard's classification of
   a `DATA` statement as being a "declaration" rather than a "specification"
   construct, and notes that the `BLOCK` construct is defined as localizing
   names that have specifications in the `BLOCK` construct.
@@ -703,7 +703,7 @@ end subroutine
   Other Fortran compilers disagree with each other in their interpretations
   of this example.
   The precedent among the most commonly used compilers
-  agrees with f18's interpretation: a `DATA` statement without any other
+  agrees with Flang's interpretation: a `DATA` statement without any other
   specification of the name refers to the host-associated object.
 
 * Many Fortran compilers allow a non-generic procedure to be `USE`-associated
@@ -729,7 +729,7 @@ module m2
 end module
 ```
 
-  This case elicits a warning from f18, as it should not be treated
+  This case elicits a warning from Flang, as it should not be treated
   any differently than the same case with the non-generic procedure of
   the same name being defined in the same scope rather than being
   `USE`-associated into it, which is explicitly non-conforming in the
@@ -747,7 +747,7 @@ end module
   symbols, much less appear in specification inquiries, and there are
   application codes that expect exterior symbols whose names match
   components to be visible in a derived-type definition's default initialization
-  expressions, and so f18 follows that precedent.
+  expressions, and so Flang follows that precedent.
 
 * 19.3.1p1 "Within its scope, a local identifier of an entity of class (1)
   or class (4) shall not be the same as a global identifier used in that scope..."
@@ -769,17 +769,17 @@ end module
   left-hand side for a pointer assignment statement, and we emit a
   portability warning when it is not.
 
-* F18 allows a `USE` statement to reference a module that is defined later
+* Flang allows a `USE` statement to reference a module that is defined later
   in the same compilation unit, so long as mutual dependencies do not form
   a cycle.
   This feature forestalls any risk of such a `USE` statement reading an
   obsolete module file from a previous compilation and then overwriting
   that file later.
 
-* F18 allows `OPTIONAL` dummy arguments to interoperable procedures
+* Flang allows `OPTIONAL` dummy arguments to interoperable procedures
   unless they are `VALUE` (C865).
 
-* F18 processes the `NAMELIST` group declarations in a scope after it
+* Flang processes the `NAMELIST` group declarations in a scope after it
   has resolved all of the names in that scope.  This means that names
   that appear before their local declarations do not resolve to host
   associated objects and do not elicit errors about improper redeclarations
@@ -862,11 +862,11 @@ print *, [(j,j=1,10)]
 
 * The Fortran standard doesn't mention integer overflow explicitly. In many cases,
   however, integer overflow makes programs non-conforming.
-  F18 follows other widely-used Fortran compilers. Specifically, f18 assumes
+  Flang follows other widely-used Fortran compilers. Specifically, Flang assumes
   integer overflow never occurs in address calculations and increment of
   do-variable unless the option `-fwrapv` is enabled.
 
-* Two new ieee_round_type values were added in f18 beyond the four values
+* Two new ieee_round_type values were added in Flang beyond the four values
   defined in f03 and f08: ieee_away and ieee_other. Contemporary hardware
   typically does not have support for these rounding modes;
   ieee_support_rounding calls for these values return false.
diff --git a/flang/docs/FortranForCProgrammers.md b/flang/docs/FortranForCProgrammers.md
index 135e6b7..9023fdc 100644
--- a/flang/docs/FortranForCProgrammers.md
+++ b/flang/docs/FortranForCProgrammers.md
@@ -304,7 +304,7 @@ Preprocessing behavior varies across implementations and one should not depend o
 much portability.
 Preprocessing is typically requested by the use of a capitalized filename
 suffix (e.g., "foo.F90") or a compiler command line option.
-(Since the F18 compiler always runs its built-in preprocessing stage,
+(Since Flang always runs its built-in preprocessing stage,
 no special option or filename suffix is required.)
 
 ## "Object Oriented" Programming
diff --git a/flang/docs/FortranIR.md b/flang/docs/FortranIR.md
index f9f8f64..7f3c7b2 100644
--- a/flang/docs/FortranIR.md
+++ b/flang/docs/FortranIR.md
@@ -171,7 +171,7 @@ FIR is intentionally similar to SIL from the statement level up to the level of
 Program, procedure, region, and basic block all leverage code from LLVM, in much the same way as SIL. These data structures have significant investment and engineering behind their use in compilers, and it makes sense to leverage that work.
 
 * Pro: Uses LLVM data structures, pervasive in compiler projects such as LLVM, SIL, etc.
-* Pro: Get used to seeing and using LLVM, as f18 aims to be an LLVM project
+* Pro: Get used to seeing and using LLVM, as Flang aims to be an LLVM project
 * Con: Uses LLVM data structures, which the project has been avoiding
 
 #### Alternative: C++ Standard Template Library
diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md
index 79af788..2d28342 100644
--- a/flang/docs/GettingInvolved.md
+++ b/flang/docs/GettingInvolved.md
@@ -41,7 +41,7 @@ Contributions to Flang are done using GitHub Pull Requests and follow the
 ### Flang Slack Workspace
 
 -   There is a Slack workspace dedicated to Flang.
--   There are a number of topic-oriented channels available (e.g., #driver, #f18-semantics, #fir).
+-   There are a number of topic-oriented channels available (e.g., #driver, #fir).
 -   Add yourself via the *[invitation link](https://join.slack.com/t/flang-compiler/shared_invite/zt-2pcn51lh-VrRQL_YUOkxA_1CEfMGQhw "title")*
 
 ## Calls
diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md
index 34b6559..bfda5f3 100644
--- a/flang/docs/Intrinsics.md
+++ b/flang/docs/Intrinsics.md
@@ -19,7 +19,7 @@ of functions or subroutines with similar interfaces as an aid to
 comprehension beyond that which might be gained from the standard's
 alphabetical list.
 
-A brief status of intrinsic procedure support in f18 is also given at the end.
+A brief status of intrinsic procedure support in Flang is also given at the end.
 
 Few procedures are actually described here apart from their interfaces; see the
 Fortran 2018 standard (section 16) for the complete story.
@@ -733,20 +733,20 @@ In case the invocation would be an error if the procedure were the intrinsic
 leaves two choices to the compiler: emit an error about the intrinsic invocation,
 or consider this is an external procedure and emit no error.
 
-f18 will always consider this case to be the intrinsic and emit errors, unless the procedure
+Flang will always consider this case to be the intrinsic and emit errors, unless the procedure
 is used as a function (resp. subroutine) and the intrinsic is a subroutine (resp. function).
 The table below gives some examples of decisions made by Fortran compilers in such case.
 
 | What is ACOS ?     | Bad intrinsic call       | External with warning |  External no warning | Other error |
 | --- | --- | --- | --- | --- |
-| `print*, ACOS()`     | gfortran, nag, xlf, f18  |  ifort                |  nvfortran           | |
-| `print*, ACOS(I)`    | gfortran, nag, xlf, f18  |  ifort                |  nvfortran           | |
-| `print*, ACOS(X=I)`  | gfortran, nag, xlf, f18  |  ifort                |                      | nvfortran (keyword on implicit extrenal )|
-| `print*, ACOS(X, X)` | gfortran, nag, xlf, f18  |  ifort                |  nvfortran           | |
-| `CALL ACOS(X)`       |                          |                       |  gfortran, nag, xlf, nvfortran, ifort, f18  | |
+| `print*, ACOS()`     | gfortran, nag, xlf, flang  |  ifort                |  nvfortran           | |
+| `print*, ACOS(I)`    | gfortran, nag, xlf, flang  |  ifort                |  nvfortran           | |
+| `print*, ACOS(X=I)`  | gfortran, nag, xlf, flang  |  ifort                |                      | nvfortran (keyword on implicit external)|
+| `print*, ACOS(X, X)` | gfortran, nag, xlf, flang  |  ifort                |  nvfortran           | |
+| `CALL ACOS(X)`       |                          |                       |  gfortran, nag, xlf, nvfortran, ifort, flang  | |
 
 
-The rationale for f18 behavior is that when referring to a procedure with an
+The rationale for Flang's behavior is that when referring to a procedure with an
 argument number or type that does not match the intrinsic specification, it seems safer to block
 the rather likely case where the user is using the intrinsic the wrong way.
 In case the user wanted to refer to an external function, he can add an explicit EXTERNAL
@@ -759,13 +759,13 @@ Also note that in general, the standard gives the compiler the right to consider
 any procedure that is not explicitly external as a non standard intrinsic (section 4.2 point 4).
 So it is highly advised for the programmer to use EXTERNAL statements to prevent any ambiguity.
 
-## Intrinsic Procedure Support in f18
-This section gives an overview of the support inside f18 libraries for the
+## Intrinsic Procedure Support in Flang
+This section gives an overview of the support inside Flang libraries for the
 intrinsic procedures listed above.
-It may be outdated, refer to f18 code base for the actual support status.
+It may be outdated; refer to the Flang code base for the actual support status.
 
 ### Semantic Analysis
-F18 semantic expression analysis phase detects intrinsic procedure references,
+Flang's semantic expression analysis phase detects intrinsic procedure references,
 validates the argument types and deduces the return types.
 This phase currently supports all the intrinsic procedures listed above but the ones in the table below.
 
@@ -789,17 +789,17 @@ Constant Expressions may be used to define kind arguments. Therefore, the semant
 expression analysis phase must be able to fold references to intrinsic functions
 listed in section 10.1.12.
 
-F18 intrinsic function folding is either performed by implementations directly
-operating on f18 scalar types or by using host runtime functions and
-host hardware types. F18 supports folding elemental intrinsic functions over
+Flang intrinsic function folding is either performed by implementations directly
+operating on Flang scalar types or by using host runtime functions and
+host hardware types. Flang supports folding elemental intrinsic functions over
 arrays when an implementation is provided for the scalars (regardless of whether
 it is using host hardware types or not).
 The status of intrinsic function folding support is given in the sub-sections below.
 
 #### Intrinsic Functions with Host Independent Folding Support
-Implementations using f18 scalar types enables folding intrinsic functions
-on any host and with any possible type kind supported by f18. The intrinsic functions
-listed below are folded using host independent implementations.
+Implementations using Flang scalar types enable folding intrinsic functions
+on any host and with any possible type kind supported by Flang. The intrinsic
+functions listed below are folded using host independent implementations.
 
 | Return Type | Intrinsic Functions with Host Independent Folding Support|
 | --- | --- |
@@ -810,12 +810,12 @@ listed below are folded using host independent implementations.
 
 #### Intrinsic Functions with Host Dependent Folding Support
 Implementations using the host runtime may not be available for all supported
-f18 types depending on the host hardware types and the libraries available on the host.
+Flang types depending on the hardware type of the host and the libraries available on it.
 The actual support on a host depends on what the host hardware types are.
 The list below gives the functions that are folded using host runtime and the related C/C++ types.
-F18 automatically detects if these types match an f18 scalar type. If so,
-folding of the intrinsic functions will be possible for the related f18 scalar type,
-otherwise an error message will be produced by f18 when attempting to fold related intrinsic functions.
+Flang automatically detects if these types match a Flang scalar type. If so,
+folding of the intrinsic functions will be possible for the related Flang scalar type,
+otherwise an error message will be produced by Flang when attempting to fold related intrinsic functions.
 
 | C/C++ Host Type | Intrinsic Functions with Host Standard C++ Library Based Folding Support |
 | --- | --- |
@@ -823,17 +823,17 @@ otherwise an error message will be produced by f18 when attempting to fold relat
 | std::complex for float, double and long double| ACOS, ACOSH, ASIN, ASINH, ATAN, ATANH, COS, COSH, EXP, LOG, SIN, SINH, SQRT, TAN, TANH |
 
 On top of the default usage of C++ standard library functions for folding described
-in the table above, it is possible to compile f18 evaluate library with
+in the table above, it is possible to compile Flang's evaluate library with
 [libpgmath](https://github.com/flang-compiler/flang/tree/master/runtime/libpgmath)
 so that it can be used for folding. To do so, one must have a compiled version
 of the libpgmath library available on the host and add
-`-DLIBPGMATH_DIR=<path to the compiled shared libpgmath library>` to the f18 cmake command.
+`-DLIBPGMATH_DIR=<path to the compiled shared libpgmath library>` to the Flang cmake command.
 
 Libpgmath comes with real and complex functions that replace C++ standard library
 float and double functions to fold all the intrinsic functions listed in the table above.
-It has no long double versions. If the host long double matches an f18 scalar type,
+It has no long double versions. If the host long double matches a Flang scalar type,
 C++ standard library functions will still be used for folding expressions with this scalar type.
-Libpgmath adds the possibility to fold the following functions for f18 real scalar
+Libpgmath adds the possibility to fold the following functions for Flang's real scalar
 types related to host float and double types.
 
 | C/C++ Host Type | Additional Intrinsic Function Folding Support with Libpgmath (Optional) |
@@ -841,10 +841,10 @@ types related to host float and double types.
 |float and double| BESSEL_J0, BESSEL_J1, BESSEL_JN (elemental only), BESSEL_Y0, BESSEL_Y1, BESSEL_Yn (elemental only), DERFC_SCALED, ERFC_SCALED, QERFC_SCALED |
 
 Libpgmath comes in three variants (precise, relaxed and fast). So far, only the
-precise version is used for intrinsic function folding in f18. It guarantees the greatest numerical precision.
+precise version is used for intrinsic function folding in Flang. It guarantees the greatest numerical precision.
 
 ### Intrinsic Functions with Missing Folding Support
-The following intrinsic functions are allowed in constant expressions but f18
+The following intrinsic functions are allowed in constant expressions but Flang
 is not yet able to fold them. Note that there might be constraints on the arguments
 so that these intrinsics can be used in constant expressions (see section 10.1.12 of Fortran 2018 standard).
 
@@ -1133,8 +1133,8 @@ end program rename_proc
 - **Standard:**  GNU extension
 - **Class:**     function
 - **Syntax:**    result = `SECNDS(refTime)`
-- **Arguments:** 
- 
+- **Arguments:**
+
 | ARGUMENT  | INTENT |      TYPE     |          KIND           |           Description                    |
 |-----------|--------|---------------|-------------------------|------------------------------------------|
 | `refTime` | `IN`   | `REAL, scalar`| REAL(KIND=4), required  | Reference time in seconds since midnight |
@@ -1157,16 +1157,16 @@ END PROGRAM example_secnds
 since midnight minus a user-supplied reference time `refTime`. Uses `REAL(KIND=8)` for higher precision.
 
 #### Usage and Info
-- **Standard:** PGI extension  
-- **Class:**     function  
-- **Syntax:**    result = `DSECNDS(refTime)`  
-- **Arguments:** 
+- **Standard:** PGI extension
+- **Class:**     function
+- **Syntax:**    result = `DSECNDS(refTime)`
+- **Arguments:**
 
 | ARGUMENT  | INTENT |      TYPE     |          KIND           |           Description                    |
 |-----------|--------|---------------|-------------------------|------------------------------------------|
 | `refTime` | `IN`   | `REAL, scalar`| REAL(KIND=8), required  | Reference time in seconds since midnight |
 
-- **Return Value:** REAL(KIND=8), scalar — seconds elapsed since `refTime`.  
+- **Return Value:** REAL(KIND=8), scalar — seconds elapsed since `refTime`.
 - **Purity:** Impure
 
 #### Example
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index f96d222..9f7c10c 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -98,6 +98,13 @@ public:
   mlir::Type getElementOrSequenceType() const {
     return hlfir::getFortranElementOrSequenceType(getType());
   }
+  /// Return the fir.class or fir.box type needed to describe this entity.
+  fir::BaseBoxType getBoxType() const {
+    if (isBoxAddressOrValue())
+      return llvm::cast<fir::BaseBoxType>(fir::unwrapRefType(getType()));
+    const bool isVolatile = fir::isa_volatile_type(getType());
+    return fir::BoxType::get(getElementOrSequenceType(), isVolatile);
+  }
 
   bool hasLengthParameters() const {
     mlir::Type eleTy = getFortranElementType();
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index fc6eedc..86502c6 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -1249,6 +1249,41 @@ def fir_IsAssumedSizeOp : fir_SimpleOp<"is_assumed_size", [NoMemoryEffect]> {
   let results = (outs BoolLike);
 }
 
+def fir_AssumedSizeExtentOp : fir_SimpleOneResultOp<"assumed_size_extent", [NoMemoryEffect]> {
+  let summary = "get the assumed-size last extent sentinel";
+
+  let description = [{
+    Returns the special extent value representing the last dimension of an
+    assumed-size array. This is used to model the semantics in FIR without
+    directly materializing the sentinel value. The concrete encoding is
+    introduced during FIR to LLVM lowering.
+
+    ```
+      %e = fir.assumed_size_extent : index
+    ```
+  }];
+
+  let results = (outs Index);
+  let assemblyFormat = "attr-dict `:` type(results)";
+}
+
+def fir_IsAssumedSizeExtentOp : fir_SimpleOp<"is_assumed_size_extent", [NoMemoryEffect]> {
+  let summary = "is value the assumed-size last extent sentinel";
+
+  let description = [{
+    Returns true iff the given integer equals the assumed-size extent sentinel.
+
+    ```
+      %t = fir.is_assumed_size_extent %v : (index) -> i1
+      %c = fir.is_assumed_size_extent %x : (i32) -> i1
+    ```
+  }];
+
+  let arguments = (ins AnyIntegerLike:$val);
+  let results = (outs BoolLike);
+  let hasCanonicalizer = 1;
+}
+
 def fir_BoxIsPtrOp : fir_SimpleOp<"box_isptr", [NoMemoryEffect]> {
   let summary = "is the boxed value a POINTER?";
 
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index 682dd82..70b9341 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -22,6 +22,7 @@
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h
index 0f85183..7539d12 100644
--- a/flang/include/flang/Semantics/openmp-utils.h
+++ b/flang/include/flang/Semantics/openmp-utils.h
@@ -13,9 +13,11 @@
 #ifndef FORTRAN_SEMANTICS_OPENMP_UTILS_H
 #define FORTRAN_SEMANTICS_OPENMP_UTILS_H
 
+#include "flang/Common/indirection.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/char-block.h"
 #include "flang/Parser/parse-tree.h"
+#include "flang/Parser/tools.h"
 #include "flang/Semantics/tools.h"
 
 #include "llvm/ADT/ArrayRef.h"
@@ -74,7 +76,11 @@ bool IsVarOrFunctionRef(const MaybeExpr &expr);
 bool IsMapEnteringType(parser::OmpMapType::Value type);
 bool IsMapExitingType(parser::OmpMapType::Value type);
 
-std::optional<SomeExpr> GetEvaluateExpr(const parser::Expr &parserExpr);
+MaybeExpr GetEvaluateExpr(const parser::Expr &parserExpr);
+template <typename T> MaybeExpr GetEvaluateExpr(const T &inp) {
+  return GetEvaluateExpr(parser::UnwrapRef<parser::Expr>(inp));
+}
+
 std::optional<evaluate::DynamicType> GetDynamicType(
     const parser::Expr &parserExpr);
 
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 3b711cc..a516a44 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -1766,7 +1766,7 @@ private:
       // to a crash due to a block with no terminator. See issue #126452.
       mlir::FunctionType funcType = builder->getFunction().getFunctionType();
       mlir::Type resultType = funcType.getResult(0);
-      mlir::Value undefResult = builder->create<fir::UndefOp>(loc, resultType);
+      mlir::Value undefResult = fir::UndefOp::create(*builder, loc, resultType);
       genExitRoutine(false, undefResult);
       return;
     }
@@ -4010,8 +4010,8 @@ private:
       // parameters and dynamic type. The selector cannot be a
       // POINTER/ALLOCATBLE as per F'2023 C1160.
       fir::ExtendedValue newExv;
-      llvm::SmallVector assumeSizeExtents{
-          builder->createMinusOneInteger(loc, builder->getIndexType())};
+      llvm::SmallVector<mlir::Value> assumeSizeExtents{
+          fir::AssumedSizeExtentOp::create(*builder, loc)};
       mlir::Value baseAddr =
           hlfir::genVariableRawAddress(loc, *builder, selector);
       const bool isVolatile = fir::isa_volatile_type(selector.getType());
@@ -4733,11 +4733,21 @@ private:
       return fir::factory::createUnallocatedBox(*builder, loc, lhsBoxType, {});
     hlfir::Entity rhs = Fortran::lower::convertExprToHLFIR(
         loc, *this, assign.rhs, localSymbols, rhsContext);
+    auto rhsBoxType = rhs.getBoxType();
     // Create pointer descriptor value from the RHS.
     if (rhs.isMutableBox())
       rhs = hlfir::Entity{fir::LoadOp::create(*builder, loc, rhs)};
-    mlir::Value rhsBox = hlfir::genVariableBox(
-        loc, *builder, rhs, lhsBoxType.getBoxTypeWithNewShape(rhs.getRank()));
+
+    // Use LHS type if LHS is not polymorphic.
+    fir::BaseBoxType targetBoxType;
+    if (assign.lhs.GetType()->IsPolymorphic())
+      targetBoxType = rhsBoxType.getBoxTypeWithNewAttr(
+          fir::BaseBoxType::Attribute::Pointer);
+    else
+      targetBoxType = lhsBoxType.getBoxTypeWithNewShape(rhs.getRank());
+    mlir::Value rhsBox =
+        hlfir::genVariableBox(loc, *builder, rhs, targetBoxType);
+
     // Apply lower bounds or reshaping if any.
     if (const auto *lbExprs =
             std::get_if<Fortran::evaluate::Assignment::BoundsSpec>(&assign.u);
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 00ec1b5..2517ab3 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -1711,7 +1711,7 @@ static void lowerExplicitLowerBounds(
 /// CFI_desc_t requirements in 18.5.3 point 5.).
 static mlir::Value getAssumedSizeExtent(mlir::Location loc,
                                         fir::FirOpBuilder &builder) {
-  return builder.createMinusOneInteger(loc, builder.getIndexType());
+  return fir::AssumedSizeExtentOp::create(builder, loc);
 }
 
 /// Lower explicit extents into \p result if this is an explicit-shape or
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index a49961c..7106728 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2059,37 +2059,38 @@ static void genCanonicalLoopNest(
     // Start lowering
     mlir::Value zero = firOpBuilder.createIntegerConstant(loc, loopVarType, 0);
     mlir::Value one = firOpBuilder.createIntegerConstant(loc, loopVarType, 1);
-    mlir::Value isDownwards = firOpBuilder.create<mlir::arith::CmpIOp>(
-        loc, mlir::arith::CmpIPredicate::slt, loopStepVar, zero);
+    mlir::Value isDownwards = mlir::arith::CmpIOp::create(
+        firOpBuilder, loc, mlir::arith::CmpIPredicate::slt, loopStepVar, zero);
 
     // Ensure we are counting upwards. If not, negate step and swap lb and ub.
     mlir::Value negStep =
-        firOpBuilder.create<mlir::arith::SubIOp>(loc, zero, loopStepVar);
-    mlir::Value incr = firOpBuilder.create<mlir::arith::SelectOp>(
-        loc, isDownwards, negStep, loopStepVar);
-    mlir::Value lb = firOpBuilder.create<mlir::arith::SelectOp>(
-        loc, isDownwards, loopUBVar, loopLBVar);
-    mlir::Value ub = firOpBuilder.create<mlir::arith::SelectOp>(
-        loc, isDownwards, loopLBVar, loopUBVar);
+        mlir::arith::SubIOp::create(firOpBuilder, loc, zero, loopStepVar);
+    mlir::Value incr = mlir::arith::SelectOp::create(
+        firOpBuilder, loc, isDownwards, negStep, loopStepVar);
+    mlir::Value lb = mlir::arith::SelectOp::create(
+        firOpBuilder, loc, isDownwards, loopUBVar, loopLBVar);
+    mlir::Value ub = mlir::arith::SelectOp::create(
+        firOpBuilder, loc, isDownwards, loopLBVar, loopUBVar);
 
     // Compute the trip count assuming lb <= ub. This guarantees that the result
     // is non-negative and we can use unsigned arithmetic.
-    mlir::Value span = firOpBuilder.create<mlir::arith::SubIOp>(
-        loc, ub, lb, ::mlir::arith::IntegerOverflowFlags::nuw);
+    mlir::Value span = mlir::arith::SubIOp::create(
+        firOpBuilder, loc, ub, lb, ::mlir::arith::IntegerOverflowFlags::nuw);
     mlir::Value tcMinusOne =
-        firOpBuilder.create<mlir::arith::DivUIOp>(loc, span, incr);
-    mlir::Value tcIfLooping = firOpBuilder.create<mlir::arith::AddIOp>(
-        loc, tcMinusOne, one, ::mlir::arith::IntegerOverflowFlags::nuw);
+        mlir::arith::DivUIOp::create(firOpBuilder, loc, span, incr);
+    mlir::Value tcIfLooping =
+        mlir::arith::AddIOp::create(firOpBuilder, loc, tcMinusOne, one,
+                                    ::mlir::arith::IntegerOverflowFlags::nuw);
 
     // Fall back to 0 if lb > ub
-    mlir::Value isZeroTC = firOpBuilder.create<mlir::arith::CmpIOp>(
-        loc, mlir::arith::CmpIPredicate::slt, ub, lb);
-    mlir::Value tripcount = firOpBuilder.create<mlir::arith::SelectOp>(
-        loc, isZeroTC, zero, tcIfLooping);
+    mlir::Value isZeroTC = mlir::arith::CmpIOp::create(
+        firOpBuilder, loc, mlir::arith::CmpIPredicate::slt, ub, lb);
+    mlir::Value tripcount = mlir::arith::SelectOp::create(
+        firOpBuilder, loc, isZeroTC, zero, tcIfLooping);
     tripcounts.push_back(tripcount);
 
     // Create the CLI handle.
-    auto newcli = firOpBuilder.create<mlir::omp::NewCliOp>(loc);
+    auto newcli = mlir::omp::NewCliOp::create(firOpBuilder, loc);
     mlir::Value cli = newcli.getResult();
     clis.push_back(cli);
 
@@ -2122,10 +2123,10 @@ static void genCanonicalLoopNest(
                "Expecting all block args to have been collected by now");
         for (auto j : llvm::seq<size_t>(numLoops)) {
           mlir::Value natIterNum = fir::getBase(blockArgs[j]);
-          mlir::Value scaled = firOpBuilder.create<mlir::arith::MulIOp>(
-              loc, natIterNum, loopStepVars[j]);
-          mlir::Value userVal = firOpBuilder.create<mlir::arith::AddIOp>(
-              loc, loopLBVars[j], scaled);
+          mlir::Value scaled = mlir::arith::MulIOp::create(
+              firOpBuilder, loc, natIterNum, loopStepVars[j]);
+          mlir::Value userVal = mlir::arith::AddIOp::create(
+              firOpBuilder, loc, loopLBVars[j], scaled);
 
           mlir::OpBuilder::InsertPoint insPt =
               firOpBuilder.saveInsertionPoint();
@@ -2198,9 +2199,9 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter,
   gridGeneratees.reserve(numLoops);
   intratileGeneratees.reserve(numLoops);
   for ([[maybe_unused]] auto i : llvm::seq<int>(0, sizesClause.sizes.size())) {
-    auto gridCLI = firOpBuilder.create<mlir::omp::NewCliOp>(loc);
+    auto gridCLI = mlir::omp::NewCliOp::create(firOpBuilder, loc);
     gridGeneratees.push_back(gridCLI.getResult());
-    auto intratileCLI = firOpBuilder.create<mlir::omp::NewCliOp>(loc);
+    auto intratileCLI = mlir::omp::NewCliOp::create(firOpBuilder, loc);
     intratileGeneratees.push_back(intratileCLI.getResult());
   }
 
@@ -2209,8 +2210,8 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter,
   generatees.append(gridGeneratees);
   generatees.append(intratileGeneratees);
 
-  firOpBuilder.create<mlir::omp::TileOp>(loc, generatees, applyees,
-                                         sizesClause.sizes);
+  mlir::omp::TileOp::create(firOpBuilder, loc, generatees, applyees,
+                            sizesClause.sizes);
 }
 
 static void genUnrollOp(Fortran::lower::AbstractConverter &converter,
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 29eedfb..d2a36d4 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -989,9 +989,18 @@ static constexpr IntrinsicHandler handlers[]{
        {"mask", asBox, handleDynamicOptional}}},
      /*isElemental=*/false},
     {"syncthreads", &I::genSyncThreads, {}, /*isElemental=*/false},
-    {"syncthreads_and", &I::genSyncThreadsAnd, {}, /*isElemental=*/false},
-    {"syncthreads_count", &I::genSyncThreadsCount, {}, /*isElemental=*/false},
-    {"syncthreads_or", &I::genSyncThreadsOr, {}, /*isElemental=*/false},
+    {"syncthreads_and_i4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false},
+    {"syncthreads_and_l4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false},
+    {"syncthreads_count_i4",
+     &I::genSyncThreadsCount,
+     {},
+     /*isElemental=*/false},
+    {"syncthreads_count_l4",
+     &I::genSyncThreadsCount,
+     {},
+     /*isElemental=*/false},
+    {"syncthreads_or_i4", &I::genSyncThreadsOr, {}, /*isElemental=*/false},
+    {"syncthreads_or_l4", &I::genSyncThreadsOr, {}, /*isElemental=*/false},
     {"syncwarp", &I::genSyncWarp, {}, /*isElemental=*/false},
     {"system",
      &I::genSystem,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 70bb43a2..478ab15 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -749,6 +749,44 @@ struct VolatileCastOpConversion
   }
 };
 
+/// Lower `fir.assumed_size_extent` to constant -1 of index type.
+struct AssumedSizeExtentOpConversion
+    : public fir::FIROpConversion<fir::AssumedSizeExtentOp> {
+  using FIROpConversion::FIROpConversion;
+
+  llvm::LogicalResult
+  matchAndRewrite(fir::AssumedSizeExtentOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    mlir::Location loc = op.getLoc();
+    mlir::Type ity = lowerTy().indexType();
+    auto cst = fir::genConstantIndex(loc, ity, rewriter, -1);
+    rewriter.replaceOp(op, cst.getResult());
+    return mlir::success();
+  }
+};
+
+/// Lower `fir.is_assumed_size_extent` to integer equality with -1.
+struct IsAssumedSizeExtentOpConversion
+    : public fir::FIROpConversion<fir::IsAssumedSizeExtentOp> {
+  using FIROpConversion::FIROpConversion;
+
+  llvm::LogicalResult
+  matchAndRewrite(fir::IsAssumedSizeExtentOp op, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    mlir::Location loc = op.getLoc();
+    mlir::Value val = adaptor.getVal();
+    mlir::Type valTy = val.getType();
+    // Create constant -1 of the operand type.
+    auto negOneAttr = rewriter.getIntegerAttr(valTy, -1);
+    auto negOne =
+        mlir::LLVM::ConstantOp::create(rewriter, loc, valTy, negOneAttr);
+    auto cmp = mlir::LLVM::ICmpOp::create(
+        rewriter, loc, mlir::LLVM::ICmpPredicate::eq, val, negOne);
+    rewriter.replaceOp(op, cmp.getResult());
+    return mlir::success();
+  }
+};
+
 /// convert value of from-type to value of to-type
 struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
   using FIROpConversion::FIROpConversion;
@@ -1113,7 +1151,7 @@ struct AllocMemOpConversion : public fir::FIROpConversion<fir::AllocMemOp> {
     mlir::Value size = genTypeSizeInBytes(loc, ity, rewriter, llvmObjectTy);
     if (auto scaleSize =
             fir::genAllocationScaleSize(loc, heap.getInType(), ity, rewriter))
-      size = rewriter.create<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
+      size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, scaleSize);
     for (mlir::Value opnd : adaptor.getOperands())
       size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size,
                                        integerCast(loc, rewriter, ity, opnd));
@@ -4360,6 +4398,7 @@ void fir::populateFIRToLLVMConversionPatterns(
       AllocaOpConversion, AllocMemOpConversion, BoxAddrOpConversion,
       BoxCharLenOpConversion, BoxDimsOpConversion, BoxEleSizeOpConversion,
       BoxIsAllocOpConversion, BoxIsArrayOpConversion, BoxIsPtrOpConversion,
+      AssumedSizeExtentOpConversion, IsAssumedSizeExtentOpConversion,
       BoxOffsetOpConversion, BoxProcHostOpConversion, BoxRankOpConversion,
       BoxTypeCodeOpConversion, BoxTypeDescOpConversion, CallOpConversion,
       CmpcOpConversion, VolatileCastOpConversion, ConvertOpConversion,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
index 381b2a2..f74d635 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp
@@ -242,10 +242,11 @@ struct TargetAllocMemOpConversion
         loc, llvmObjectTy, ity, rewriter, lowerTy().getDataLayout());
     if (auto scaleSize = fir::genAllocationScaleSize(
             loc, allocmemOp.getInType(), ity, rewriter))
-      size = rewriter.create<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
+      size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, scaleSize);
     for (mlir::Value opnd : adaptor.getOperands().drop_front())
-      size = rewriter.create<mlir::LLVM::MulOp>(
-          loc, ity, size, integerCast(lowerTy(), loc, rewriter, ity, opnd));
+      size = mlir::LLVM::MulOp::create(
+          rewriter, loc, ity, size,
+          integerCast(lowerTy(), loc, rewriter, ity, opnd));
     auto mallocTyWidth = lowerTy().getIndexTypeBitwidth();
     auto mallocTy =
         mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth);
diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
index ac285b5..0776346 100644
--- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
@@ -872,6 +872,14 @@ public:
       }
     }
 
+    // Count the number of arguments that have to stay in place at the end of
+    // the argument list.
+    unsigned trailingArgs = 0;
+    if constexpr (std::is_same_v<FuncOpTy, mlir::gpu::GPUFuncOp>) {
+      trailingArgs =
+          func.getNumWorkgroupAttributions() + func.getNumPrivateAttributions();
+    }
+
     // Convert return value(s)
     for (auto ty : funcTy.getResults())
       llvm::TypeSwitch<mlir::Type>(ty)
@@ -981,6 +989,16 @@ public:
       }
     }
 
+    // Add the argument at the end if the number of trailing arguments is 0,
+    // otherwise insert the argument at the appropriate index.
+    auto addOrInsertArgument = [&](mlir::Type ty, mlir::Location loc) {
+      unsigned inputIndex = func.front().getArguments().size() - trailingArgs;
+      auto newArg = trailingArgs == 0
+                        ? func.front().addArgument(ty, loc)
+                        : func.front().insertArgument(inputIndex, ty, loc);
+      return newArg;
+    };
+
     if (!func.empty()) {
       // If the function has a body, then apply the fixups to the arguments and
       // return ops as required. These fixups are done in place.
@@ -1117,8 +1135,7 @@ public:
           // original arguments. (Boxchar arguments.)
           auto newBufArg =
               func.front().insertArgument(fixup.index, fixupType, loc);
-          auto newLenArg =
-              func.front().addArgument(trailingTys[fixup.second], loc);
+          auto newLenArg = addOrInsertArgument(trailingTys[fixup.second], loc);
           auto boxTy = oldArgTys[fixup.index - offset];
           rewriter->setInsertionPointToStart(&func.front());
           auto box = fir::EmboxCharOp::create(*rewriter, loc, boxTy, newBufArg,
@@ -1133,8 +1150,7 @@ public:
           // appended after all the original arguments.
           auto newProcPointerArg =
               func.front().insertArgument(fixup.index, fixupType, loc);
-          auto newLenArg =
-              func.front().addArgument(trailingTys[fixup.second], loc);
+          auto newLenArg = addOrInsertArgument(trailingTys[fixup.second], loc);
           auto tupleType = oldArgTys[fixup.index - offset];
           rewriter->setInsertionPointToStart(&func.front());
           fir::FirOpBuilder builder(*rewriter, getModule());
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 1712af1..d0164f3 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -5143,6 +5143,34 @@ void fir::BoxTotalElementsOp::getCanonicalizationPatterns(
 }
 
 //===----------------------------------------------------------------------===//
+// IsAssumedSizeExtentOp and AssumedSizeExtentOp
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct FoldIsAssumedSizeExtentOnCtor
+    : public mlir::OpRewritePattern<fir::IsAssumedSizeExtentOp> {
+  using mlir::OpRewritePattern<fir::IsAssumedSizeExtentOp>::OpRewritePattern;
+  mlir::LogicalResult
+  matchAndRewrite(fir::IsAssumedSizeExtentOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    if (llvm::isa_and_nonnull<fir::AssumedSizeExtentOp>(
+            op.getVal().getDefiningOp())) {
+      mlir::Type i1 = rewriter.getI1Type();
+      rewriter.replaceOpWithNewOp<mlir::arith::ConstantOp>(
+          op, i1, rewriter.getIntegerAttr(i1, 1));
+      return mlir::success();
+    }
+    return mlir::failure();
+  }
+};
+} // namespace
+
+void fir::IsAssumedSizeExtentOp::getCanonicalizationPatterns(
+    mlir::RewritePatternSet &patterns, mlir::MLIRContext *context) {
+  patterns.add<FoldIsAssumedSizeExtentOnCtor>(context);
+}
+
+//===----------------------------------------------------------------------===//
 // LocalitySpecifierOp
 //===----------------------------------------------------------------------===//
 
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
index 4840a99..0d135a9 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCRecipeBufferization.cpp
@@ -39,13 +39,13 @@ public:
 
   static mlir::Operation *load(mlir::OpBuilder &builder, mlir::Location loc,
                                mlir::Value value) {
-    return builder.create<fir::LoadOp>(loc, value);
+    return fir::LoadOp::create(builder, loc, value);
   }
 
   static mlir::Value placeInMemory(mlir::OpBuilder &builder, mlir::Location loc,
                                    mlir::Value value) {
-    auto alloca = builder.create<fir::AllocaOp>(loc, value.getType());
-    builder.create<fir::StoreOp>(loc, value, alloca);
+    auto alloca = fir::AllocaOp::create(builder, loc, value.getType());
+    fir::StoreOp::create(builder, loc, value, alloca);
     return alloca;
   }
 };
diff --git a/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp b/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp
index 817434f..5793d46 100644
--- a/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp
+++ b/flang/lib/Optimizer/OpenMP/AutomapToTargetData.cpp
@@ -130,8 +130,8 @@ class AutomapToTargetDataPass
           builder.getBoolAttr(false));
       clauses.mapVars.push_back(mapInfo);
       isa<fir::StoreOp>(memOp)
-          ? builder.create<omp::TargetEnterDataOp>(memOp.getLoc(), clauses)
-          : builder.create<omp::TargetExitDataOp>(memOp.getLoc(), clauses);
+          ? omp::TargetEnterDataOp::create(builder, memOp.getLoc(), clauses)
+          : omp::TargetExitDataOp::create(builder, memOp.getLoc(), clauses);
     };
 
     for (fir::GlobalOp globalOp : automapGlobals) {
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 65a23be..1229018 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -595,7 +595,7 @@ private:
               mlir::omp::TargetOperands &clauseOps,
               mlir::omp::LoopNestOperands &loopNestClauseOps,
               const LiveInShapeInfoMap &liveInShapeInfoMap) const {
-    auto targetOp = rewriter.create<mlir::omp::TargetOp>(loc, clauseOps);
+    auto targetOp = mlir::omp::TargetOp::create(rewriter, loc, clauseOps);
     auto argIface = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*targetOp);
 
     mlir::Region &region = targetOp.getRegion();
@@ -672,7 +672,7 @@ private:
     // temporary.
     Fortran::utils::openmp::cloneOrMapRegionOutsiders(builder, targetOp);
     rewriter.setInsertionPoint(
-        rewriter.create<mlir::omp::TerminatorOp>(targetOp.getLoc()));
+        mlir::omp::TerminatorOp::create(rewriter, targetOp.getLoc()));
 
     return targetOp;
   }
@@ -715,8 +715,8 @@ private:
 
         auto shapeShiftType = fir::ShapeShiftType::get(
             builder.getContext(), shapeShiftOperands.size() / 2);
-        return builder.create<fir::ShapeShiftOp>(
-            liveInArg.getLoc(), shapeShiftType, shapeShiftOperands);
+        return fir::ShapeShiftOp::create(builder, liveInArg.getLoc(),
+                                         shapeShiftType, shapeShiftOperands);
       }
 
       llvm::SmallVector<mlir::Value> shapeOperands;
@@ -728,11 +728,11 @@ private:
         ++shapeIdx;
       }
 
-      return builder.create<fir::ShapeOp>(liveInArg.getLoc(), shapeOperands);
+      return fir::ShapeOp::create(builder, liveInArg.getLoc(), shapeOperands);
     }();
 
-    return builder.create<hlfir::DeclareOp>(liveInArg.getLoc(), liveInArg,
-                                            liveInName, shape);
+    return hlfir::DeclareOp::create(builder, liveInArg.getLoc(), liveInArg,
+                                    liveInName, shape);
   }
 
   mlir::omp::TeamsOp genTeamsOp(mlir::ConversionPatternRewriter &rewriter,
@@ -742,13 +742,13 @@ private:
     genReductions(rewriter, mapper, loop, teamsOps);
 
     mlir::Location loc = loop.getLoc();
-    auto teamsOp = rewriter.create<mlir::omp::TeamsOp>(loc, teamsOps);
+    auto teamsOp = mlir::omp::TeamsOp::create(rewriter, loc, teamsOps);
     Fortran::common::openmp::EntryBlockArgs teamsArgs;
     teamsArgs.reduction.vars = teamsOps.reductionVars;
     Fortran::common::openmp::genEntryBlock(rewriter, teamsArgs,
                                            teamsOp.getRegion());
 
-    rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
+    rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));
 
     for (auto [loopVar, teamsArg] : llvm::zip_equal(
              loop.getReduceVars(), teamsOp.getRegion().getArguments())) {
@@ -761,8 +761,8 @@ private:
   mlir::omp::DistributeOp
   genDistributeOp(mlir::Location loc,
                   mlir::ConversionPatternRewriter &rewriter) const {
-    auto distOp = rewriter.create<mlir::omp::DistributeOp>(
-        loc, /*clauses=*/mlir::omp::DistributeOperands{});
+    auto distOp = mlir::omp::DistributeOp::create(
+        rewriter, loc, /*clauses=*/mlir::omp::DistributeOperands{});
 
     rewriter.createBlock(&distOp.getRegion());
     return distOp;
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp
index 8a9b383..7b61539 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkdistribute.cpp
@@ -282,14 +282,14 @@ fissionWorkdistribute(omp::WorkdistributeOp workdistribute) {
           &newTeams.getRegion(), newTeams.getRegion().begin(), {}, {});
       for (auto arg : teamsBlock->getArguments())
         newTeamsBlock->addArgument(arg.getType(), arg.getLoc());
-      auto newWorkdistribute = rewriter.create<omp::WorkdistributeOp>(loc);
-      rewriter.create<omp::TerminatorOp>(loc);
+      auto newWorkdistribute = omp::WorkdistributeOp::create(rewriter, loc);
+      omp::TerminatorOp::create(rewriter, loc);
       rewriter.createBlock(&newWorkdistribute.getRegion(),
                            newWorkdistribute.getRegion().begin(), {}, {});
       auto *cloned = rewriter.clone(*parallelize);
       parallelize->replaceAllUsesWith(cloned);
       parallelize->erase();
-      rewriter.create<omp::TerminatorOp>(loc);
+      omp::TerminatorOp::create(rewriter, loc);
       changed = true;
     }
   }
@@ -298,10 +298,10 @@ fissionWorkdistribute(omp::WorkdistributeOp workdistribute) {
 
 /// Generate omp.parallel operation with an empty region.
 static void genParallelOp(Location loc, OpBuilder &rewriter, bool composite) {
-  auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
+  auto parallelOp = mlir::omp::ParallelOp::create(rewriter, loc);
   parallelOp.setComposite(composite);
   rewriter.createBlock(&parallelOp.getRegion());
-  rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
+  rewriter.setInsertionPoint(mlir::omp::TerminatorOp::create(rewriter, loc));
   return;
 }
 
@@ -309,7 +309,7 @@ static void genParallelOp(Location loc, OpBuilder &rewriter, bool composite) {
 static void genDistributeOp(Location loc, OpBuilder &rewriter, bool composite) {
   mlir::omp::DistributeOperands distributeClauseOps;
   auto distributeOp =
-      rewriter.create<mlir::omp::DistributeOp>(loc, distributeClauseOps);
+      mlir::omp::DistributeOp::create(rewriter, loc, distributeClauseOps);
   distributeOp.setComposite(composite);
   auto distributeBlock = rewriter.createBlock(&distributeOp.getRegion());
   rewriter.setInsertionPointToStart(distributeBlock);
@@ -334,12 +334,12 @@ static void genWsLoopOp(mlir::OpBuilder &rewriter, fir::DoLoopOp doLoop,
                         const mlir::omp::LoopNestOperands &clauseOps,
                         bool composite) {
 
-  auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
+  auto wsloopOp = mlir::omp::WsloopOp::create(rewriter, doLoop.getLoc());
   wsloopOp.setComposite(composite);
   rewriter.createBlock(&wsloopOp.getRegion());
 
   auto loopNestOp =
-      rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);
+      mlir::omp::LoopNestOp::create(rewriter, doLoop.getLoc(), clauseOps);
 
   // Clone the loop's body inside the loop nest construct using the
   // mapped values.
@@ -351,7 +351,7 @@ static void genWsLoopOp(mlir::OpBuilder &rewriter, fir::DoLoopOp doLoop,
   // Erase fir.result op of do loop and create yield op.
   if (auto resultOp = dyn_cast<fir::ResultOp>(terminatorOp)) {
     rewriter.setInsertionPoint(terminatorOp);
-    rewriter.create<mlir::omp::YieldOp>(doLoop->getLoc());
+    mlir::omp::YieldOp::create(rewriter, doLoop->getLoc());
     terminatorOp->erase();
   }
 }
@@ -494,15 +494,15 @@ static SmallVector<Value> convertFlatToMultiDim(OpBuilder &builder,
   // Convert flat index to multi-dimensional indices
   SmallVector<Value> indices(rank);
   Value temp = flatIdx;
-  auto c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  auto c1 = arith::ConstantIndexOp::create(builder, loc, 1);
 
   // Work backwards through dimensions (row-major order)
   for (int i = rank - 1; i >= 0; --i) {
-    Value zeroBasedIdx = builder.create<arith::RemSIOp>(loc, temp, extents[i]);
+    Value zeroBasedIdx = arith::RemSIOp::create(builder, loc, temp, extents[i]);
     // Convert to one-based index
-    indices[i] = builder.create<arith::AddIOp>(loc, zeroBasedIdx, c1);
+    indices[i] = arith::AddIOp::create(builder, loc, zeroBasedIdx, c1);
     if (i > 0) {
-      temp = builder.create<arith::DivSIOp>(loc, temp, extents[i]);
+      temp = arith::DivSIOp::create(builder, loc, temp, extents[i]);
     }
   }
 
@@ -525,7 +525,7 @@ static Value CalculateTotalElements(OpBuilder &builder, Location loc,
     if (i == 0) {
       totalElems = extent;
     } else {
-      totalElems = builder.create<arith::MulIOp>(loc, totalElems, extent);
+      totalElems = arith::MulIOp::create(builder, loc, totalElems, extent);
     }
   }
   return totalElems;
@@ -562,14 +562,14 @@ static void replaceWithUnorderedDoLoop(OpBuilder &builder, Location loc,
   // Load destination array box (if it's a reference)
   Value arrayBox = destBox;
   if (isa<fir::ReferenceType>(destBox.getType()))
-    arrayBox = builder.create<fir::LoadOp>(loc, destBox);
+    arrayBox = fir::LoadOp::create(builder, loc, destBox);
 
-  auto scalarValue = builder.create<fir::BoxAddrOp>(loc, srcBox);
-  Value scalar = builder.create<fir::LoadOp>(loc, scalarValue);
+  auto scalarValue = fir::BoxAddrOp::create(builder, loc, srcBox);
+  Value scalar = fir::LoadOp::create(builder, loc, scalarValue);
 
   // Calculate total number of elements (flattened)
-  auto c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
-  auto c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  auto c0 = arith::ConstantIndexOp::create(builder, loc, 0);
+  auto c1 = arith::ConstantIndexOp::create(builder, loc, 1);
   Value totalElems = CalculateTotalElements(builder, loc, arrayBox);
 
   auto *workdistributeBlock = &workdistribute.getRegion().front();
@@ -587,7 +587,7 @@ static void replaceWithUnorderedDoLoop(OpBuilder &builder, Location loc,
       builder, loc, fir::ReferenceType::get(scalar.getType()), arrayBox,
       nullptr, nullptr, ValueRange{indices}, ValueRange{});
 
-  builder.create<fir::StoreOp>(loc, scalar, elemPtr);
+  fir::StoreOp::create(builder, loc, scalar, elemPtr);
 }
 
 /// workdistributeRuntimeCallLower method finds the runtime calls
@@ -749,14 +749,15 @@ FailureOr<omp::TargetOp> splitTargetData(omp::TargetOp targetOp,
   auto deviceAddrVars = targetOp.getHasDeviceAddrVars();
   auto devicePtrVars = targetOp.getIsDevicePtrVars();
   // Create the target data op
-  auto targetDataOp = rewriter.create<omp::TargetDataOp>(
-      loc, device, ifExpr, outerMapInfos, deviceAddrVars, devicePtrVars);
+  auto targetDataOp =
+      omp::TargetDataOp::create(rewriter, loc, device, ifExpr, outerMapInfos,
+                                deviceAddrVars, devicePtrVars);
   auto taregtDataBlock = rewriter.createBlock(&targetDataOp.getRegion());
-  rewriter.create<mlir::omp::TerminatorOp>(loc);
+  mlir::omp::TerminatorOp::create(rewriter, loc);
   rewriter.setInsertionPointToStart(taregtDataBlock);
   // Create the inner target op
-  auto newTargetOp = rewriter.create<omp::TargetOp>(
-      targetOp.getLoc(), targetOp.getAllocateVars(),
+  auto newTargetOp = omp::TargetOp::create(
+      rewriter, targetOp.getLoc(), targetOp.getAllocateVars(),
       targetOp.getAllocatorVars(), targetOp.getBareAttr(),
       targetOp.getDependKindsAttr(), targetOp.getDependVars(),
       targetOp.getDevice(), targetOp.getHasDeviceAddrVars(),
@@ -821,19 +822,19 @@ static TempOmpVar allocateTempOmpVar(Location loc, Type ty,
   // Get the appropriate type for allocation
   if (isPtr(ty)) {
     Type intTy = rewriter.getI32Type();
-    auto one = rewriter.create<LLVM::ConstantOp>(loc, intTy, 1);
+    auto one = LLVM::ConstantOp::create(rewriter, loc, intTy, 1);
     allocType = llvmPtrTy;
-    alloc = rewriter.create<LLVM::AllocaOp>(loc, llvmPtrTy, allocType, one);
+    alloc = LLVM::AllocaOp::create(rewriter, loc, llvmPtrTy, allocType, one);
     allocType = intTy;
   } else {
     allocType = ty;
-    alloc = rewriter.create<fir::AllocaOp>(loc, allocType);
+    alloc = fir::AllocaOp::create(rewriter, loc, allocType);
   }
   // Lambda to create mapinfo ops
   auto getMapInfo = [&](mlir::omp::ClauseMapFlags mappingFlags,
                         const char *name) {
-    return rewriter.create<omp::MapInfoOp>(
-        loc, alloc.getType(), alloc, TypeAttr::get(allocType),
+    return omp::MapInfoOp::create(
+        rewriter, loc, alloc.getType(), alloc, TypeAttr::get(allocType),
         rewriter.getAttr<omp::ClauseMapFlagsAttr>(mappingFlags),
         rewriter.getAttr<omp::VariableCaptureKindAttr>(
             omp::VariableCaptureKind::ByRef),
@@ -979,12 +980,12 @@ static void reloadCacheAndRecompute(
     // If the original value is a pointer or reference, load and convert if
     // necessary.
     if (isPtr(original.getType())) {
-      restored = rewriter.create<LLVM::LoadOp>(loc, llvmPtrTy, newArg);
+      restored = LLVM::LoadOp::create(rewriter, loc, llvmPtrTy, newArg);
       if (!isa<LLVM::LLVMPointerType>(original.getType()))
         restored =
-            rewriter.create<fir::ConvertOp>(loc, original.getType(), restored);
+            fir::ConvertOp::create(rewriter, loc, original.getType(), restored);
     } else {
-      restored = rewriter.create<fir::LoadOp>(loc, newArg);
+      restored = fir::LoadOp::create(rewriter, loc, newArg);
     }
     irMapping.map(original, restored);
   }
@@ -1053,7 +1054,7 @@ static mlir::LLVM::ConstantOp
 genI32Constant(mlir::Location loc, mlir::RewriterBase &rewriter, int value) {
   mlir::Type i32Ty = rewriter.getI32Type();
   mlir::IntegerAttr attr = rewriter.getI32IntegerAttr(value);
-  return rewriter.create<mlir::LLVM::ConstantOp>(loc, i32Ty, attr);
+  return mlir::LLVM::ConstantOp::create(rewriter, loc, i32Ty, attr);
 }
 
 /// Given a box descriptor, extract the base address of the data it describes.
@@ -1230,8 +1231,8 @@ static void genFortranAssignOmpReplacement(fir::FirOpBuilder &builder,
       genOmpGetMappedPtrIfPresent(builder, loc, destBase, device, module);
   Value srcPtr =
       genOmpGetMappedPtrIfPresent(builder, loc, srcBase, device, module);
-  Value zero = builder.create<LLVM::ConstantOp>(loc, builder.getI64Type(),
-                                                builder.getI64IntegerAttr(0));
+  Value zero = LLVM::ConstantOp::create(builder, loc, builder.getI64Type(),
+                                        builder.getI64IntegerAttr(0));
 
   // Generate the call to omp_target_memcpy to perform the data copy on the
   // device.
@@ -1348,23 +1349,24 @@ static LogicalResult moveToHost(omp::TargetOp targetOp, RewriterBase &rewriter,
   for (Operation *op : opsToReplace) {
     if (auto allocOp = dyn_cast<fir::AllocMemOp>(op)) {
       rewriter.setInsertionPoint(allocOp);
-      auto ompAllocmemOp = rewriter.create<omp::TargetAllocMemOp>(
-          allocOp.getLoc(), rewriter.getI64Type(), device,
+      auto ompAllocmemOp = omp::TargetAllocMemOp::create(
+          rewriter, allocOp.getLoc(), rewriter.getI64Type(), device,
           allocOp.getInTypeAttr(), allocOp.getUniqNameAttr(),
           allocOp.getBindcNameAttr(), allocOp.getTypeparams(),
           allocOp.getShape());
-      auto firConvertOp = rewriter.create<fir::ConvertOp>(
-          allocOp.getLoc(), allocOp.getResult().getType(),
-          ompAllocmemOp.getResult());
+      auto firConvertOp = fir::ConvertOp::create(rewriter, allocOp.getLoc(),
+                                                 allocOp.getResult().getType(),
+                                                 ompAllocmemOp.getResult());
       rewriter.replaceOp(allocOp, firConvertOp.getResult());
     }
     // Replace fir.freemem with omp.target_freemem.
     else if (auto freeOp = dyn_cast<fir::FreeMemOp>(op)) {
       rewriter.setInsertionPoint(freeOp);
-      auto firConvertOp = rewriter.create<fir::ConvertOp>(
-          freeOp.getLoc(), rewriter.getI64Type(), freeOp.getHeapref());
-      rewriter.create<omp::TargetFreeMemOp>(freeOp.getLoc(), device,
-                                            firConvertOp.getResult());
+      auto firConvertOp =
+          fir::ConvertOp::create(rewriter, freeOp.getLoc(),
+                                 rewriter.getI64Type(), freeOp.getHeapref());
+      omp::TargetFreeMemOp::create(rewriter, freeOp.getLoc(), device,
+                                   firConvertOp.getResult());
       rewriter.eraseOp(freeOp);
     }
     // fir.declare changes its type when hoisting it out of omp.target to
@@ -1376,8 +1378,9 @@ static LogicalResult moveToHost(omp::TargetOp targetOp, RewriterBase &rewriter,
           dyn_cast<fir::ReferenceType>(clonedInType);
       Type clonedEleTy = clonedRefType.getElementType();
       rewriter.setInsertionPoint(op);
-      Value loadedValue = rewriter.create<fir::LoadOp>(
-          clonedDeclareOp.getLoc(), clonedEleTy, clonedDeclareOp.getMemref());
+      Value loadedValue =
+          fir::LoadOp::create(rewriter, clonedDeclareOp.getLoc(), clonedEleTy,
+                              clonedDeclareOp.getMemref());
       clonedDeclareOp.getResult().replaceAllUsesWith(loadedValue);
     }
     // Replace runtime calls with omp versions.
@@ -1473,8 +1476,8 @@ genPreTargetOp(omp::TargetOp targetOp, SmallVector<Value> &preMapOperands,
   auto *targetBlock = &targetOp.getRegion().front();
   SmallVector<Value> preHostEvalVars{targetOp.getHostEvalVars()};
   // update the hostEvalVars of preTargetOp
-  omp::TargetOp preTargetOp = rewriter.create<omp::TargetOp>(
-      targetOp.getLoc(), targetOp.getAllocateVars(),
+  omp::TargetOp preTargetOp = omp::TargetOp::create(
+      rewriter, targetOp.getLoc(), targetOp.getAllocateVars(),
       targetOp.getAllocatorVars(), targetOp.getBareAttr(),
       targetOp.getDependKindsAttr(), targetOp.getDependVars(),
       targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), preHostEvalVars,
@@ -1513,13 +1516,13 @@ genPreTargetOp(omp::TargetOp targetOp, SmallVector<Value> &preMapOperands,
     // Create the store operation.
     if (isPtr(originalResult.getType())) {
       if (!isa<LLVM::LLVMPointerType>(toStore.getType()))
-        toStore = rewriter.create<fir::ConvertOp>(loc, llvmPtrTy, toStore);
-      rewriter.create<LLVM::StoreOp>(loc, toStore, newArg);
+        toStore = fir::ConvertOp::create(rewriter, loc, llvmPtrTy, toStore);
+      LLVM::StoreOp::create(rewriter, loc, toStore, newArg);
     } else {
-      rewriter.create<fir::StoreOp>(loc, toStore, newArg);
+      fir::StoreOp::create(rewriter, loc, toStore, newArg);
     }
   }
-  rewriter.create<omp::TerminatorOp>(loc);
+  omp::TerminatorOp::create(rewriter, loc);
 
   // Update hostEvalVars with the mapped values for the loop bounds if we have
   // a loopNestOp and we are not generating code for the target device.
@@ -1563,8 +1566,8 @@ genIsolatedTargetOp(omp::TargetOp targetOp, SmallVector<Value> &postMapOperands,
                                 hostEvalVars.steps.end());
   }
   // Create the isolated target op
-  omp::TargetOp isolatedTargetOp = rewriter.create<omp::TargetOp>(
-      targetOp.getLoc(), targetOp.getAllocateVars(),
+  omp::TargetOp isolatedTargetOp = omp::TargetOp::create(
+      rewriter, targetOp.getLoc(), targetOp.getAllocateVars(),
       targetOp.getAllocatorVars(), targetOp.getBareAttr(),
       targetOp.getDependKindsAttr(), targetOp.getDependVars(),
       targetOp.getDevice(), targetOp.getHasDeviceAddrVars(),
@@ -1590,7 +1593,7 @@ genIsolatedTargetOp(omp::TargetOp targetOp, SmallVector<Value> &postMapOperands,
 
   // Clone the original operations.
   rewriter.clone(*splitBeforeOp, isolatedMapping);
-  rewriter.create<omp::TerminatorOp>(loc);
+  omp::TerminatorOp::create(rewriter, loc);
 
   // update the loop bounds in the isolatedTargetOp if we have host_eval vars
   // and we are not generating code for the target device.
@@ -1643,8 +1646,8 @@ static omp::TargetOp genPostTargetOp(omp::TargetOp targetOp,
   auto *targetBlock = &targetOp.getRegion().front();
   SmallVector<Value> postHostEvalVars{targetOp.getHostEvalVars()};
   // Create the post target op
-  omp::TargetOp postTargetOp = rewriter.create<omp::TargetOp>(
-      targetOp.getLoc(), targetOp.getAllocateVars(),
+  omp::TargetOp postTargetOp = omp::TargetOp::create(
+      rewriter, targetOp.getLoc(), targetOp.getAllocateVars(),
       targetOp.getAllocatorVars(), targetOp.getBareAttr(),
       targetOp.getDependKindsAttr(), targetOp.getDependVars(),
       targetOp.getDevice(), targetOp.getHasDeviceAddrVars(), postHostEvalVars,
diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
index 566e88b..bd07d7f 100644
--- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
+++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
@@ -883,18 +883,16 @@ class MapInfoFinalizationPass
     if (explicitMappingPresent(op, targetDataOp))
       return;
 
-    mlir::omp::MapInfoOp newDescParentMapOp =
-        builder.create<mlir::omp::MapInfoOp>(
-            op->getLoc(), op.getResult().getType(), op.getVarPtr(),
-            op.getVarTypeAttr(),
-            builder.getAttr<mlir::omp::ClauseMapFlagsAttr>(
-                mlir::omp::ClauseMapFlags::to |
-                mlir::omp::ClauseMapFlags::always),
-            op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{},
-            mlir::SmallVector<mlir::Value>{}, mlir::ArrayAttr{},
-            /*bounds=*/mlir::SmallVector<mlir::Value>{},
-            /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
-            /*partial_map=*/builder.getBoolAttr(false));
+    mlir::omp::MapInfoOp newDescParentMapOp = mlir::omp::MapInfoOp::create(
+        builder, op->getLoc(), op.getResult().getType(), op.getVarPtr(),
+        op.getVarTypeAttr(),
+        builder.getAttr<mlir::omp::ClauseMapFlagsAttr>(
+            mlir::omp::ClauseMapFlags::to | mlir::omp::ClauseMapFlags::always),
+        op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{},
+        mlir::SmallVector<mlir::Value>{}, mlir::ArrayAttr{},
+        /*bounds=*/mlir::SmallVector<mlir::Value>{},
+        /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
+        /*partial_map=*/builder.getBoolAttr(false));
 
     targetDataOp.getMapVarsMutable().append({newDescParentMapOp});
   }
@@ -946,14 +944,13 @@ class MapInfoFinalizationPass
     // need to see how well this alteration works.
     auto loadBaseAddr =
         builder.loadIfRef(op->getLoc(), baseAddr.getVarPtrPtr());
-    mlir::omp::MapInfoOp newBaseAddrMapOp =
-        builder.create<mlir::omp::MapInfoOp>(
-            op->getLoc(), loadBaseAddr.getType(), loadBaseAddr,
-            baseAddr.getVarTypeAttr(), baseAddr.getMapTypeAttr(),
-            baseAddr.getMapCaptureTypeAttr(), mlir::Value{}, members,
-            membersAttr, baseAddr.getBounds(),
-            /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
-            /*partial_map=*/builder.getBoolAttr(false));
+    mlir::omp::MapInfoOp newBaseAddrMapOp = mlir::omp::MapInfoOp::create(
+        builder, op->getLoc(), loadBaseAddr.getType(), loadBaseAddr,
+        baseAddr.getVarTypeAttr(), baseAddr.getMapTypeAttr(),
+        baseAddr.getMapCaptureTypeAttr(), mlir::Value{}, members, membersAttr,
+        baseAddr.getBounds(),
+        /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
+        /*partial_map=*/builder.getBoolAttr(false));
     op.replaceAllUsesWith(newBaseAddrMapOp.getResult());
     op->erase();
     baseAddr.erase();
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 6dae39b..103e736 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -426,6 +426,12 @@ void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
 
   // Add codegen pass pipeline.
   fir::createDefaultFIRCodeGenPassPipeline(pm, config, inputFilename);
+
+  // Run a pass to prepare for translation of delayed privatization in the
+  // context of deferred target tasks.
+  addPassConditionally(pm, disableFirToLlvmIr, [&]() {
+    return mlir::omp::createPrepareForOMPOffloadPrivatizationPass();
+  });
 }
 
 } // namespace fir
diff --git a/flang/lib/Optimizer/Support/Utils.cpp b/flang/lib/Optimizer/Support/Utils.cpp
index 92390e4a..2f33d89 100644
--- a/flang/lib/Optimizer/Support/Utils.cpp
+++ b/flang/lib/Optimizer/Support/Utils.cpp
@@ -66,7 +66,7 @@ fir::genConstantIndex(mlir::Location loc, mlir::Type ity,
                       mlir::ConversionPatternRewriter &rewriter,
                       std::int64_t offset) {
   auto cattr = rewriter.getI64IntegerAttr(offset);
-  return rewriter.create<mlir::LLVM::ConstantOp>(loc, ity, cattr);
+  return mlir::LLVM::ConstantOp::create(rewriter, loc, ity, cattr);
 }
 
 mlir::Value
@@ -125,9 +125,9 @@ mlir::Value fir::integerCast(const fir::LLVMTypeConverter &converter,
       return rewriter.createOrFold<mlir::LLVM::SExtOp>(loc, ty, val);
   } else {
     if (toSize < fromSize)
-      return rewriter.create<mlir::LLVM::TruncOp>(loc, ty, val);
+      return mlir::LLVM::TruncOp::create(rewriter, loc, ty, val);
     if (toSize > fromSize)
-      return rewriter.create<mlir::LLVM::SExtOp>(loc, ty, val);
+      return mlir::LLVM::SExtOp::create(rewriter, loc, ty, val);
   }
   return val;
 }
diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
index ed9a2ae..5bf783d 100644
--- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
+++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
@@ -832,8 +832,8 @@ static mlir::Type getEleTy(mlir::Type ty) {
 static bool isAssumedSize(llvm::SmallVectorImpl<mlir::Value> &extents) {
   if (extents.empty())
     return false;
-  auto cstLen = fir::getIntIfConstant(extents.back());
-  return cstLen.has_value() && *cstLen == -1;
+  return llvm::isa_and_nonnull<fir::AssumedSizeExtentOp>(
+      extents.back().getDefiningOp());
 }
 
 // Extract extents from the ShapeOp/ShapeShiftOp into the result vector.
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index 3d2db6a..caa9bdd 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -131,6 +131,9 @@ struct FindHostArray
     return (*this)(x.base());
   }
   Result operator()(const Symbol &symbol) const {
+    if (symbol.IsFuncResult()) {
+      return nullptr;
+    }
     if (const auto *details{
             symbol.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()}) {
       if (details->IsArray() &&
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index be10669..4141630 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -61,6 +61,124 @@ namespace Fortran::semantics {
 using namespace Fortran::semantics::omp;
 using namespace Fortran::parser::omp;
 
+OmpStructureChecker::OmpStructureChecker(SemanticsContext &context)
+    : DirectiveStructureChecker(context,
+#define GEN_FLANG_DIRECTIVE_CLAUSE_MAP
+#include "llvm/Frontend/OpenMP/OMP.inc"
+      ) {
+  scopeStack_.push_back(&context.globalScope());
+}
+
+bool OmpStructureChecker::Enter(const parser::MainProgram &x) {
+  using StatementProgramStmt = parser::Statement<parser::ProgramStmt>;
+  if (auto &stmt{std::get<std::optional<StatementProgramStmt>>(x.t)}) {
+    scopeStack_.push_back(stmt->statement.v.symbol->scope());
+  } else {
+    for (const Scope &scope : context_.globalScope().children()) {
+      // There can only be one main program.
+      if (scope.kind() == Scope::Kind::MainProgram) {
+        scopeStack_.push_back(&scope);
+        break;
+      }
+    }
+  }
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::MainProgram &x) {
+  scopeStack_.pop_back();
+}
+
+bool OmpStructureChecker::Enter(const parser::BlockData &x) {
+  // The BLOCK DATA name is optional, so we need to look for the
+  // corresponding scope in the global scope.
+  auto &stmt{std::get<parser::Statement<parser::BlockDataStmt>>(x.t)};
+  if (auto &name{stmt.statement.v}) {
+    scopeStack_.push_back(name->symbol->scope());
+  } else {
+    for (const Scope &scope : context_.globalScope().children()) {
+      if (scope.kind() == Scope::Kind::BlockData) {
+        if (scope.symbol()->name().empty()) {
+          scopeStack_.push_back(&scope);
+          break;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::BlockData &x) {
+  scopeStack_.pop_back();
+}
+
+bool OmpStructureChecker::Enter(const parser::Module &x) {
+  auto &stmt{std::get<parser::Statement<parser::ModuleStmt>>(x.t)};
+  const Symbol *sym{stmt.statement.v.symbol};
+  scopeStack_.push_back(sym->scope());
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::Module &x) {
+  scopeStack_.pop_back();
+}
+
+bool OmpStructureChecker::Enter(const parser::Submodule &x) {
+  auto &stmt{std::get<parser::Statement<parser::SubmoduleStmt>>(x.t)};
+  const Symbol *sym{std::get<parser::Name>(stmt.statement.t).symbol};
+  scopeStack_.push_back(sym->scope());
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::Submodule &x) {
+  scopeStack_.pop_back();
+}
+
+// Function/subroutine subprogram nodes don't appear in INTERFACEs, but
+// the subprogram/end statements do.
+bool OmpStructureChecker::Enter(const parser::SubroutineStmt &x) {
+  const Symbol *sym{std::get<parser::Name>(x.t).symbol};
+  scopeStack_.push_back(sym->scope());
+  return true;
+}
+
+bool OmpStructureChecker::Enter(const parser::EndSubroutineStmt &x) {
+  scopeStack_.pop_back();
+  return true;
+}
+
+bool OmpStructureChecker::Enter(const parser::FunctionStmt &x) {
+  const Symbol *sym{std::get<parser::Name>(x.t).symbol};
+  scopeStack_.push_back(sym->scope());
+  return true;
+}
+
+bool OmpStructureChecker::Enter(const parser::EndFunctionStmt &x) {
+  scopeStack_.pop_back();
+  return true;
+}
+
+bool OmpStructureChecker::Enter(const parser::BlockConstruct &x) {
+  auto &specPart{std::get<parser::BlockSpecificationPart>(x.t)};
+  auto &execPart{std::get<parser::Block>(x.t)};
+  if (auto &&source{parser::GetSource(specPart)}) {
+    scopeStack_.push_back(&context_.FindScope(*source));
+  } else if (auto &&source{parser::GetSource(execPart)}) {
+    scopeStack_.push_back(&context_.FindScope(*source));
+  }
+  return true;
+}
+
+void OmpStructureChecker::Leave(const parser::BlockConstruct &x) {
+  // Pop the scope pushed by Enter(BlockConstruct). Enter pushes only when
+  // one of the two parts of the construct has a source, so mirror that test.
+  auto &specPart{std::get<parser::BlockSpecificationPart>(x.t)};
+  auto &execPart{std::get<parser::Block>(x.t)};
+  if (parser::GetSource(specPart) || parser::GetSource(execPart)) {
+    scopeStack_.pop_back();
+  }
+}
+
 // Use when clause falls under 'struct OmpClause' in 'parse-tree.h'.
 #define CHECK_SIMPLE_CLAUSE(X, Y) \
   void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \
@@ -362,6 +480,36 @@ bool OmpStructureChecker::IsNestedInDirective(llvm::omp::Directive directive) {
   return false;
 }
 
+bool OmpStructureChecker::InTargetRegion() {
+  // Lexically nested in a TARGET construct. This deliberately does not
+  // exclude device_type(host).
+  if (IsNestedInDirective(llvm::omp::Directive::OMPD_target)) {
+    return true;
+  }
+  // Otherwise look for a tracked scope whose symbol carries the
+  // OmpDeclareTarget flag, starting from the innermost scope on the stack.
+  auto isDeclareTarget{[](const Scope *scope) {
+    const auto *symbol{scope->symbol()};
+    return symbol && symbol->test(Symbol::Flag::OmpDeclareTarget);
+  }};
+  return llvm::any_of(llvm::reverse(scopeStack_), isDeclareTarget);
+}
+
+bool OmpStructureChecker::HasRequires(llvm::omp::Clause req) {
+  const Scope &unit{GetProgramUnit(*scopeStack_.back())};
+  return common::visit(
+      [&](const auto &details) {
+        if constexpr (std::is_convertible_v<decltype(details),
+                          const WithOmpDeclarative &>) {
+          if (auto *reqs{details.ompRequires()}) {
+            return reqs->test(req);
+          }
+        }
+        return false;
+      },
+      DEREF(unit.symbol()).details());
+}
+
 void OmpStructureChecker::CheckVariableListItem(
     const SymbolSourceMap &symbols) {
   for (auto &[symbol, source] : symbols) {
@@ -1562,40 +1710,95 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) {
   dirContext_.pop_back();
 }
 
-void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) {
-  isPredefinedAllocator = true;
-  const auto &dir{std::get<parser::Verbatim>(x.t)};
-  const auto &objectList{std::get<parser::OmpObjectList>(x.t)};
-  PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate);
-  SymbolSourceMap currSymbols;
-  GetSymbolsInObjectList(objectList, currSymbols);
-  for (auto &[symbol, source] : currSymbols) {
-    if (IsPointer(*symbol)) {
-      context_.Say(source,
-          "List item '%s' in ALLOCATE directive must not have POINTER "
-          "attribute"_err_en_US,
-          source.ToString());
+void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source,
+    const parser::OmpObjectList &objects,
+    const parser::OmpClauseList &clauses) {
+  const Scope &thisScope{context_.FindScope(source)};
+  SymbolSourceMap symbols;
+  GetSymbolsInObjectList(objects, symbols);
+
+  auto maybeHasPredefinedAllocator{[&](const parser::OmpClause *calloc) {
+    // Return "true" if the ALLOCATOR clause was provided with an argument
+    // that is either a predefined allocator, or a run-time value.
+    // Otherwise return "false".
+    if (!calloc) {
+      return false;
     }
-    if (IsDummy(*symbol)) {
+    auto *allocator{std::get_if<parser::OmpClause::Allocator>(&calloc->u)};
+    if (auto val{ToInt64(GetEvaluateExpr(DEREF(allocator).v))}) {
+      // Predefined allocators (defined in OpenMP 6.0 20.8.1):
+      //   omp_null_allocator = 0,
+      //   omp_default_mem_alloc = 1,
+      //   omp_large_cap_mem_alloc = 2,
+      //   omp_const_mem_alloc = 3,
+      //   omp_high_bw_mem_alloc = 4,
+      //   omp_low_lat_mem_alloc = 5,
+      //   omp_cgroup_mem_alloc = 6,
+      //   omp_pteam_mem_alloc = 7,
+      //   omp_thread_mem_alloc = 8
+      return *val >= 0 && *val <= 8;
+    }
+    return true;
+  }};
+
+  const auto *allocator{FindClause(llvm::omp::Clause::OMPC_allocator)};
+  if (InTargetRegion()) {
+    bool hasDynAllocators{
+        HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)};
+    if (!allocator && !hasDynAllocators) {
       context_.Say(source,
-          "List item '%s' in ALLOCATE directive must not be a dummy "
-          "argument"_err_en_US,
-          source.ToString());
+          "An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US);
+    }
+  }
+
+  auto maybePredefined{maybeHasPredefinedAllocator(allocator)};
+
+  for (auto &[symbol, source] : symbols) {
+    if (!inExecutableAllocate_) {
+      if (symbol->owner() != thisScope) {
+        context_.Say(source,
+            "A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears"_err_en_US);
+      }
+      if (IsPointer(*symbol) || IsAllocatable(*symbol)) {
+        context_.Say(source,
+            "A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute"_err_en_US);
+      }
     }
     if (symbol->GetUltimate().has<AssocEntityDetails>()) {
       context_.Say(source,
-          "List item '%s' in ALLOCATE directive must not be an associate "
-          "name"_err_en_US,
-          source.ToString());
+          "A list item in a declarative ALLOCATE cannot be an associate name"_err_en_US);
+    }
+    if (symbol->attrs().test(Attr::SAVE) || IsCommonBlock(*symbol)) {
+      if (!allocator) {
+        context_.Say(source,
+            "If a list item is a named common block or has SAVE attribute, an ALLOCATOR clause must be present with a predefined allocator"_err_en_US);
+      } else if (!maybePredefined) {
+        context_.Say(source,
+            "If a list item is a named common block or has SAVE attribute, only a predefined allocator may be used on the ALLOCATOR clause"_err_en_US);
+      }
+    }
+    if (FindCommonBlockContaining(*symbol)) {
+      context_.Say(source,
+          "A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block"_err_en_US);
     }
   }
-  CheckVarIsNotPartOfAnotherVar(dir.source, objectList);
+  CheckVarIsNotPartOfAnotherVar(source, objects);
 }
 
-void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) {
+void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) {
   const auto &dir{std::get<parser::Verbatim>(x.t)};
-  const auto &objectList{std::get<parser::OmpObjectList>(x.t)};
-  CheckPredefinedAllocatorRestriction(dir.source, objectList);
+  PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate);
+}
+
+void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) {
+  if (!inExecutableAllocate_) {
+    const auto &dir{std::get<parser::Verbatim>(x.t)};
+    const auto &clauseList{std::get<parser::OmpClauseList>(x.t)};
+    const auto &objectList{std::get<parser::OmpObjectList>(x.t)};
+
+    isPredefinedAllocator = true;
+    CheckAllocateDirective(dir.source, objectList, clauseList);
+  }
   dirContext_.pop_back();
 }
 
@@ -1951,6 +2154,7 @@ void OmpStructureChecker::CheckNameInAllocateStmt(
 }
 
 void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) {
+  inExecutableAllocate_ = true;
   const auto &dir{std::get<parser::Verbatim>(x.t)};
   PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate);
 
@@ -1960,24 +2164,6 @@ void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) {
         "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US);
   }
 
-  bool hasAllocator = false;
-  // TODO: Investigate whether searching the clause list can be done with
-  // parser::Unwrap instead of the following loop
-  const auto &clauseList{std::get<parser::OmpClauseList>(x.t)};
-  for (const auto &clause : clauseList.v) {
-    if (std::get_if<parser::OmpClause::Allocator>(&clause.u)) {
-      hasAllocator = true;
-    }
-  }
-
-  if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && !hasAllocator) {
-    // TODO: expand this check to exclude the case when a requires
-    //       directive with the dynamic_allocators clause is present
-    //       in the same compilation unit (OMP5.0 2.11.3).
-    context_.Say(x.source,
-        "ALLOCATE directives that appear in a TARGET region must specify an allocator clause"_err_en_US);
-  }
-
   const auto &allocateStmt =
       std::get<parser::Statement<parser::AllocateStmt>>(x.t).statement;
   if (const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}) {
@@ -1994,18 +2180,34 @@ void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) {
   }
 
   isPredefinedAllocator = true;
-  const auto &objectList{std::get<std::optional<parser::OmpObjectList>>(x.t)};
-  if (objectList) {
-    CheckVarIsNotPartOfAnotherVar(dir.source, *objectList);
-  }
 }
 
 void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) {
-  const auto &dir{std::get<parser::Verbatim>(x.t)};
-  const auto &objectList{std::get<std::optional<parser::OmpObjectList>>(x.t)};
-  if (objectList)
-    CheckPredefinedAllocatorRestriction(dir.source, *objectList);
+  parser::OmpObjectList empty{std::list<parser::OmpObject>{}};
+  auto &objects{[&]() -> const parser::OmpObjectList & {
+    if (auto &objects{std::get<std::optional<parser::OmpObjectList>>(x.t)}) {
+      return *objects;
+    } else {
+      return empty;
+    }
+  }()};
+  auto &clauses{std::get<parser::OmpClauseList>(x.t)};
+  CheckAllocateDirective(
+      std::get<parser::Verbatim>(x.t).source, objects, clauses);
+
+  if (const auto &subDirs{
+          std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>(
+              x.t)}) {
+    for (const auto &dalloc : *subDirs) {
+      const auto &dir{std::get<parser::Verbatim>(x.t)};
+      const auto &clauses{std::get<parser::OmpClauseList>(dalloc.t)};
+      const auto &objects{std::get<parser::OmpObjectList>(dalloc.t)};
+      CheckAllocateDirective(dir.source, objects, clauses);
+    }
+  }
+
   dirContext_.pop_back();
+  inExecutableAllocate_ = false;
 }
 
 void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) {
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index b3fd6c8..7426559 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -56,21 +56,32 @@ using SymbolSourceMap = std::multimap<const Symbol *, parser::CharBlock>;
 using DirectivesClauseTriple = std::multimap<llvm::omp::Directive,
     std::pair<llvm::omp::Directive, const OmpClauseSet>>;
 
-class OmpStructureChecker
-    : public DirectiveStructureChecker<llvm::omp::Directive, llvm::omp::Clause,
-          parser::OmpClause, llvm::omp::Clause_enumSize> {
+using OmpStructureCheckerBase = DirectiveStructureChecker<llvm::omp::Directive,
+    llvm::omp::Clause, parser::OmpClause, llvm::omp::Clause_enumSize>;
+
+class OmpStructureChecker : public OmpStructureCheckerBase {
 public:
-  using Base = DirectiveStructureChecker<llvm::omp::Directive,
-      llvm::omp::Clause, parser::OmpClause, llvm::omp::Clause_enumSize>;
+  using Base = OmpStructureCheckerBase;
+
+  OmpStructureChecker(SemanticsContext &context);
 
-  OmpStructureChecker(SemanticsContext &context)
-      : DirectiveStructureChecker(context,
-#define GEN_FLANG_DIRECTIVE_CLAUSE_MAP
-#include "llvm/Frontend/OpenMP/OMP.inc"
-        ) {
-  }
   using llvmOmpClause = const llvm::omp::Clause;
 
+  bool Enter(const parser::MainProgram &);
+  void Leave(const parser::MainProgram &);
+  bool Enter(const parser::BlockData &);
+  void Leave(const parser::BlockData &);
+  bool Enter(const parser::Module &);
+  void Leave(const parser::Module &);
+  bool Enter(const parser::Submodule &);
+  void Leave(const parser::Submodule &);
+  bool Enter(const parser::SubroutineStmt &);
+  bool Enter(const parser::EndSubroutineStmt &);
+  bool Enter(const parser::FunctionStmt &);
+  bool Enter(const parser::EndFunctionStmt &);
+  bool Enter(const parser::BlockConstruct &);
+  void Leave(const parser::BlockConstruct &);
+
   void Enter(const parser::OpenMPConstruct &);
   void Leave(const parser::OpenMPConstruct &);
   void Enter(const parser::OpenMPInteropConstruct &);
@@ -177,10 +188,12 @@ private:
       const parser::CharBlock &, const OmpDirectiveSet &);
   bool IsCloselyNestedRegion(const OmpDirectiveSet &set);
   bool IsNestedInDirective(llvm::omp::Directive directive);
+  bool InTargetRegion();
   void HasInvalidTeamsNesting(
       const llvm::omp::Directive &dir, const parser::CharBlock &source);
   void HasInvalidDistributeNesting(const parser::OpenMPLoopConstruct &x);
   void HasInvalidLoopBinding(const parser::OpenMPLoopConstruct &x);
+  bool HasRequires(llvm::omp::Clause req);
   // specific clause related
   void CheckAllowedMapTypes(
       parser::OmpMapType::Value, llvm::ArrayRef<parser::OmpMapType::Value>);
@@ -250,6 +263,9 @@ private:
   bool CheckTargetBlockOnlyTeams(const parser::Block &);
   void CheckWorkshareBlockStmts(const parser::Block &, parser::CharBlock);
   void CheckWorkdistributeBlockStmts(const parser::Block &, parser::CharBlock);
+  void CheckAllocateDirective(parser::CharBlock source,
+      const parser::OmpObjectList &objects,
+      const parser::OmpClauseList &clauses);
 
   void CheckIteratorRange(const parser::OmpIteratorSpecifier &x);
   void CheckIteratorModifier(const parser::OmpIterator &x);
@@ -367,12 +383,15 @@ private:
   };
   int directiveNest_[LastType + 1] = {0};
 
+  bool inExecutableAllocate_{false};
   parser::CharBlock visitedAtomicSource_;
   SymbolSourceMap deferredNonVariables_;
 
   using LoopConstruct = std::variant<const parser::DoConstruct *,
       const parser::OpenMPLoopConstruct *>;
   std::vector<LoopConstruct> loopStack_;
+  // Scopes for scoping units.
+  std::vector<const Scope *> scopeStack_;
 };
 
 /// Find a duplicate entry in the range, and return an iterator to it.
diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp
index 292e73b..cc55bb4 100644
--- a/flang/lib/Semantics/openmp-utils.cpp
+++ b/flang/lib/Semantics/openmp-utils.cpp
@@ -218,7 +218,7 @@ bool IsMapExitingType(parser::OmpMapType::Value type) {
   }
 }
 
-std::optional<SomeExpr> GetEvaluateExpr(const parser::Expr &parserExpr) {
+MaybeExpr GetEvaluateExpr(const parser::Expr &parserExpr) {
   const parser::TypedExpr &typedExpr{parserExpr.typedExpr};
   // ForwardOwningPointer           typedExpr
   // `- GenericExprWrapper          ^.get()
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index c410bd4..196755e 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -3094,26 +3094,6 @@ void OmpAttributeVisitor::ResolveOmpDesignator(
         AddAllocateName(name);
       }
     }
-    if (ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective &&
-        IsAllocatable(*symbol) &&
-        !IsNestedInDirective(llvm::omp::Directive::OMPD_allocate)) {
-      context_.Say(designator.source,
-          "List items specified in the ALLOCATE directive must not have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement"_err_en_US);
-    }
-    bool checkScope{ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective};
-    // In 5.1 the scope check only applies to declarative allocate.
-    if (version == 50 && !checkScope) {
-      checkScope = ompFlag == Symbol::Flag::OmpExecutableAllocateDirective;
-    }
-    if (checkScope) {
-      if (omp::GetScopingUnit(GetContext().scope) !=
-          omp::GetScopingUnit(symbol->GetUltimate().owner())) {
-        context_.Say(designator.source, // 2.15.3
-            "List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US,
-            parser::ToUpperCaseLetters(
-                llvm::omp::getOpenMPDirectiveName(directive, version)));
-      }
-    }
     if (ompFlag == Symbol::Flag::OmpReduction) {
       // Using variables inside of a namelist in OpenMP reductions
       // is allowed by the standard, but is not allowed for
diff --git a/flang/lib/Utils/OpenMP.cpp b/flang/lib/Utils/OpenMP.cpp
index 15a42c3..c2036c4 100644
--- a/flang/lib/Utils/OpenMP.cpp
+++ b/flang/lib/Utils/OpenMP.cpp
@@ -112,7 +112,7 @@ mlir::Value mapTemporaryValue(fir::FirOpBuilder &firOpBuilder,
   mlir::Block *entryBlock = &region.getBlocks().front();
   firOpBuilder.setInsertionPointToStart(entryBlock);
   auto loadOp =
-      firOpBuilder.create<fir::LoadOp>(clonedValArg.getLoc(), clonedValArg);
+      fir::LoadOp::create(firOpBuilder, clonedValArg.getLoc(), clonedValArg);
   return loadOp.getResult();
 }
 
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 22df9cd..5182950 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -21,23 +21,32 @@ implicit none
     procedure :: syncthreads
   end interface
 
-  interface
-    attributes(device) integer function syncthreads_and(value)
-      integer, value :: value
+  interface syncthreads_and
+    attributes(device) integer function syncthreads_and_i4(value)
+      integer(4), value :: value
     end function
-  end interface
+    attributes(device) integer function syncthreads_and_l4(value)
+      logical(4), value :: value
+    end function
+  end interface syncthreads_and
 
-  interface
-    attributes(device) integer function syncthreads_count(value)
-      integer, value :: value
+  interface syncthreads_count
+    attributes(device) integer function syncthreads_count_i4(value)
+      integer(4), value :: value
     end function
-  end interface
+    attributes(device) integer function syncthreads_count_l4(value)
+      logical(4), value :: value
+    end function
+  end interface syncthreads_count
 
-  interface
-    attributes(device) integer function syncthreads_or(value)
-      integer, value :: value
+  interface syncthreads_or
+    attributes(device) integer function syncthreads_or_i4(value)
+      integer(4), value :: value
     end function
-  end interface
+    attributes(device) integer function syncthreads_or_l4(value)
+      logical(4), value :: value
+    end function
+  end interface syncthreads_or
 
   interface
     attributes(device) subroutine syncwarp(mask)
diff --git a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
index a334934..48fee10 100644
--- a/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
+++ b/flang/test/Fir/CUDA/cuda-target-rewrite.mlir
@@ -55,3 +55,56 @@ func.func @main(%arg0: complex<f64>) {
 // CHECK-SAME: (%arg0: f64, %arg1: f64) kernel {
 // CHECK: gpu.return
 // CHECK: gpu.launch_func  @testmod::@_QPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i64 dynamic_shared_memory_size %{{.*}} args(%{{.*}} : f64, %{{.*}} : f64) {cuf.proc_attr = #cuf.cuda_proc<global>}
+
+// -----
+
+module attributes {gpu.container_module, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  gpu.module @testmod {
+    gpu.func @_QMbarPfoo(%arg0: f32, %arg1: !fir.ref<!fir.array<100xf32>>, %arg2: !fir.boxchar<1>) workgroup(%arg3 : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) {
+      %c0 = arith.constant 0 : index
+      memref.store %arg0, %arg3[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      gpu.return
+    }
+// CHECK-LABEL: gpu.func @_QMbarPfoo(
+// CHECK-SAME: %{{.*}}: f32, %{{.*}}: !fir.ref<!fir.array<100xf32>>, %[[CHAR:.*]]: !fir.ref<!fir.char<1,?>>, %[[LENGTH:.*]]: i64) workgroup(%[[WORKGROUP:.*]] : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) {
+// CHECK: %{{.*}} = fir.emboxchar %[[CHAR]], %[[LENGTH]] : (!fir.ref<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
+// CHECK: memref.store %{{.*}}, %[[WORKGROUP]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+
+    gpu.func @_QMbarPfoo2(%arg0: f32, %arg1: !fir.ref<!fir.array<100xf32>>, %arg2: !fir.boxchar<1>) workgroup(%arg3 : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}, %arg4 : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) {
+      %c0 = arith.constant 0 : index
+      memref.store %arg0, %arg3[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      memref.store %arg0, %arg4[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      gpu.return
+    }
+// CHECK-LABEL: gpu.func @_QMbarPfoo2(
+// CHECK-SAME: %{{.*}}: f32, %{{.*}}: !fir.ref<!fir.array<100xf32>>, %[[CHAR:.*]]: !fir.ref<!fir.char<1,?>>, %[[LENGTH:.*]]: i64) workgroup(%[[WG1:.*]] : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}, %[[WG2:.*]] : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) {
+// CHECK: %{{.*}} = fir.emboxchar %[[CHAR]], %[[LENGTH]] : (!fir.ref<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
+// CHECK: memref.store %{{.*}}, %[[WG1]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+// CHECK: memref.store %{{.*}}, %[[WG2]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+
+    gpu.func @_QMbarPprivate(%arg0: f32, %arg1: !fir.boxchar<1>) workgroup(%arg2 : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) private(%arg3 : memref<1xf32, #gpu.address_space<private>> {llvm.align = 16 : i32}) {
+      %c0 = arith.constant 0 : index
+      memref.store %arg0, %arg2[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      memref.store %arg0, %arg3[%c0] : memref<1xf32, #gpu.address_space<private>>
+      gpu.return
+    }
+// CHECK-LABEL: gpu.func @_QMbarPprivate(
+// CHECK-SAME: %{{.*}}: f32, %[[CHAR:.*]]: !fir.ref<!fir.char<1,?>>, %[[LENGTH:.*]]: i64) workgroup(%[[WG:.*]] : memref<1xf32, #gpu.address_space<workgroup>> {llvm.align = 16 : i32}) private(%[[PRIVATE:.*]] : memref<1xf32, #gpu.address_space<private>> {llvm.align = 16 : i32}) {
+// CHECK: %{{.*}} = fir.emboxchar %[[CHAR]], %[[LENGTH]] : (!fir.ref<!fir.char<1,?>>, i64) -> !fir.boxchar<1>
+// CHECK: memref.store %{{.*}}, %[[WG]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+// CHECK: memref.store %{{.*}}, %[[PRIVATE]][%{{.*}}] : memref<1xf32, #gpu.address_space<private>>
+    
+    gpu.func @test_with_char_proc(%arg0: f32, %arg1: tuple<() -> (), i64> {fir.char_proc}) workgroup(%arg2 : memref<1xf32, #gpu.address_space<workgroup>>) {
+      %c0 = arith.constant 0 : index
+      memref.store %arg0, %arg2[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
+      gpu.return
+    }
+// CHECK-LABEL: gpu.func @test_with_char_proc(
+// CHECK-SAME: %{{.*}}: f32, %[[CHARPROC:.*]]: () -> () {fir.char_proc}, %[[LENGTH:.*]]: i64) workgroup(%[[WG:.*]] : memref<1xf32, #gpu.address_space<workgroup>>) {
+// CHECK: %{{.*}} = fir.undefined tuple<() -> (), i64>
+// CHECK: %{{.*}} = fir.insert_value %{{.*}}, %[[CHARPROC]], [0 : index] : (tuple<() -> (), i64>, () -> ()) -> tuple<() -> (), i64>
+// CHECK: %{{.*}} = fir.insert_value %{{.*}}, %[[LENGTH]], [1 : index] : (tuple<() -> (), i64>, i64) -> tuple<() -> (), i64>
+// CHECK: memref.store %{{.*}}, %[[WG]][%{{.*}}] : memref<1xf32, #gpu.address_space<workgroup>>
+  }
+}
+
diff --git a/flang/test/Fir/assumed-size-ops-codegen.fir b/flang/test/Fir/assumed-size-ops-codegen.fir
new file mode 100644
index 0000000..54e9b3c
--- /dev/null
+++ b/flang/test/Fir/assumed-size-ops-codegen.fir
@@ -0,0 +1,19 @@
+// RUN: fir-opt --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" %s | FileCheck %s
+
+// CHECK-LABEL: @assumed_size_extent(
+// CHECK: %[[CNEG1:.*]] = llvm.mlir.constant(-1 : i64)
+// CHECK: llvm.return %[[CNEG1]] : i64
+func.func @assumed_size_extent() -> index {
+  %e = fir.assumed_size_extent : index
+  return %e : index
+}
+
+// CHECK-LABEL:  @is_assumed_size_extent(
+// CHECK: %[[NEG1:.*]] = llvm.mlir.constant(-1 : i64)
+// CHECK: %[[CMP:.*]] = llvm.icmp "eq"
+// CHECK: llvm.return %[[CMP]] : i1
+func.func @is_assumed_size_extent(%x: index) -> i1 {
+  %c = fir.is_assumed_size_extent %x : (index) -> i1
+  return %c : i1
+}
+
diff --git a/flang/test/Fir/assumed-size-ops-folding.fir b/flang/test/Fir/assumed-size-ops-folding.fir
new file mode 100644
index 0000000..9fd5fab
--- /dev/null
+++ b/flang/test/Fir/assumed-size-ops-folding.fir
@@ -0,0 +1,13 @@
+// RUN: fir-opt --canonicalize %s | FileCheck %s
+
+// Verify: fir.is_assumed_size_extent(fir.assumed_size_extent) folds to i1 true.
+
+// CHECK-LABEL: func.func @fold(
+func.func @fold() -> i1 {
+  %e = fir.assumed_size_extent : index
+  // CHECK: %[[C:.*]] = arith.constant true
+  %t = fir.is_assumed_size_extent %e : (index) -> i1
+  return %t : i1
+}
+
+
diff --git a/flang/test/Fir/assumed-size-ops-roundtrip.fir b/flang/test/Fir/assumed-size-ops-roundtrip.fir
new file mode 100644
index 0000000..c3c1883
--- /dev/null
+++ b/flang/test/Fir/assumed-size-ops-roundtrip.fir
@@ -0,0 +1,13 @@
+// RUN: fir-opt %s | fir-opt | FileCheck %s
+
+func.func @roundtrip() {
+  // CHECK: %[[E:.*]] = fir.assumed_size_extent : index
+  %e = fir.assumed_size_extent : index
+
+  // CHECK: %[[T:.*]] = fir.is_assumed_size_extent %[[E]] : (index) -> i1
+  %t = fir.is_assumed_size_extent %e : (index) -> i1
+
+  return
+}
+
+
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 5159c91..6d2beae 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -161,4 +161,5 @@ func.func @_QQmain() {
 // PASSES-NEXT:  LowerNontemporalPass
 // PASSES-NEXT: FIRToLLVMLowering
 // PASSES-NEXT: ReconcileUnrealizedCasts
+// PASSES-NEXT: PrepareForOMPOffloadPrivatizationPass
 // PASSES-NEXT: LLVMIRLoweringPass
diff --git a/flang/test/HLFIR/assumed-type-actual-args.f90 b/flang/test/HLFIR/assumed-type-actual-args.f90
index 42e9ed2..aaac98b 100644
--- a/flang/test/HLFIR/assumed-type-actual-args.f90
+++ b/flang/test/HLFIR/assumed-type-actual-args.f90
@@ -113,7 +113,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPtest2(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.array<?xnone>> {fir.bindc_name = "x"}) {
 ! CHECK:           %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK:           %[[VAL_1:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_1:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) dummy_scope %[[DSCOPE]] {uniq_name = "_QFtest2Ex"} : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xnone>>, !fir.ref<!fir.array<?xnone>>)
 ! CHECK:           fir.call @_QPs2(%[[VAL_3]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xnone>>) -> ()
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 29c348c..55bb587 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -12,17 +12,23 @@ attributes(global) subroutine devsub()
   integer(8) :: al
   integer(8) :: time
   integer :: smalltime
-  integer(4) :: res
+  integer(4) :: res, offset
   integer(8) :: resl
 
+  integer :: tid
+  tid = threadIdx%x
+
   call syncthreads()
   call syncwarp(1)
   call threadfence()
   call threadfence_block()
   call threadfence_system()
   ret = syncthreads_and(1)
+  res = syncthreads_and(tid > offset)
   ret = syncthreads_count(1)
+  ret = syncthreads_count(tid > offset)
   ret = syncthreads_or(1)
+  ret = syncthreads_or(tid > offset)
 
   ai = atomicadd(ai, 1_4)
   al = atomicadd(al, 1_8)
@@ -100,9 +106,21 @@ end
 ! CHECK: fir.call @llvm.nvvm.membar.gl() fastmath<contract> : () -> ()
 ! CHECK: fir.call @llvm.nvvm.membar.cta() fastmath<contract> : () -> ()
 ! CHECK: fir.call @llvm.nvvm.membar.sys() fastmath<contract> : () -> ()
-! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1_i32_0) fastmath<contract> : (i32) -> i32
-! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1_i32_1) fastmath<contract> : (i32) -> i32
-! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1_i32_2) fastmath<contract> : (i32) -> i32
+! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[A]], %[[B]] : i32
+! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.and(%[[CMP]])
+! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[A]], %[[B]] : i32
+! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.popc(%[[CMP]]) fastmath<contract> : (i1) -> i32
+! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+! CHECK: %[[A:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %[[B:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %[[CMP:.*]] = arith.cmpi sgt, %[[A]], %[[B]] : i32
+! CHECK: %{{.*}} = fir.call @llvm.nvvm.barrier0.or(%[[CMP]]) fastmath<contract> : (i1) -> i32
 ! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %{{.*}} = llvm.atomicrmw add  %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64
 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32
diff --git a/flang/test/Lower/HLFIR/assumed-rank-iface.f90 b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
index 9ecbb7c..ffb36fa 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-iface.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-iface.f90
@@ -145,7 +145,7 @@ end subroutine
 ! CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
 ! CHECK:           %[[VAL_4:.*]] = arith.cmpi sgt, %[[VAL_2]], %[[VAL_3]] : index
 ! CHECK:           %[[VAL_5:.*]] = arith.select %[[VAL_4]], %[[VAL_2]], %[[VAL_3]] : index
-! CHECK:           %[[VAL_6:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_6:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_5]], %[[VAL_6]] : (index, index) -> !fir.shape<2>
 ! CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_7]]) dummy_scope %{{[0-9]+}} {uniq_name = "_QFint_r2_assumed_size_to_assumed_rankEx"} : (!fir.ref<!fir.array<10x?xi32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<10x?xi32>>, !fir.ref<!fir.array<10x?xi32>>)
 ! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.box<!fir.array<10x?xi32>>) -> !fir.box<!fir.array<*:i32>>
diff --git a/flang/test/Lower/HLFIR/select-rank.f90 b/flang/test/Lower/HLFIR/select-rank.f90
index 0f80c72..f1f968de 100644
--- a/flang/test/Lower/HLFIR/select-rank.f90
+++ b/flang/test/Lower/HLFIR/select-rank.f90
@@ -371,7 +371,7 @@ end subroutine
 ! CHECK:           fir.call @_QPr1(%[[VAL_11]]#0) fastmath<contract> : (!fir.box<!fir.array<?xf32>>) -> ()
 ! CHECK:           cf.br ^bb6
 ! CHECK:         ^bb5:
-! CHECK:           %[[VAL_12:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_12:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_13:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
 ! CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (!fir.ref<!fir.array<*:f32>>) -> !fir.ref<!fir.array<?xf32>>
 ! CHECK:           %[[VAL_15:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
@@ -435,7 +435,7 @@ end subroutine
 ! CHECK:           fir.call @_QPrdefault(%[[VAL_8]]#0) fastmath<contract> : (!fir.box<!fir.array<*:f32>>) -> ()
 ! CHECK:           cf.br ^bb5
 ! CHECK:         ^bb4:
-! CHECK:           %[[VAL_9:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_9:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_10:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
 ! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.ref<!fir.array<*:f32>>) -> !fir.ref<!fir.array<?xf32>>
 ! CHECK:           %[[VAL_12:.*]] = fir.shape %[[VAL_9]] : (index) -> !fir.shape<1>
@@ -482,7 +482,7 @@ end subroutine
 ! CHECK:           fir.call @_QPr1_implicit(%[[VAL_21]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xf32>>) -> ()
 ! CHECK:           cf.br ^bb6
 ! CHECK:         ^bb5:
-! CHECK:           %[[VAL_22:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_22:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_23:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>>
 ! CHECK:           %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (!fir.ref<!fir.array<*:f32>>) -> !fir.ref<!fir.array<?xf32>>
 ! CHECK:           %[[VAL_25:.*]] = fir.shape %[[VAL_22]] : (index) -> !fir.shape<1>
@@ -534,7 +534,7 @@ end subroutine
 ! CHECK:           fir.call @_QPrc1_implicit(%[[VAL_26]]) fastmath<contract> : (!fir.boxchar<1>) -> ()
 ! CHECK:           cf.br ^bb6
 ! CHECK:         ^bb5:
-! CHECK:           %[[VAL_27:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_27:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_28:.*]] = fir.box_addr %[[VAL_8]]#1 : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> !fir.ref<!fir.array<*:!fir.char<1,?>>>
 ! CHECK:           %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (!fir.ref<!fir.array<*:!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
 ! CHECK:           %[[VAL_30:.*]] = fir.shape %[[VAL_27]] : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/Intrinsics/lbound.f90 b/flang/test/Lower/Intrinsics/lbound.f90
index a5ca2d3..75c11ff 100644
--- a/flang/test/Lower/Intrinsics/lbound.f90
+++ b/flang/test/Lower/Intrinsics/lbound.f90
@@ -40,7 +40,7 @@ end subroutine
 subroutine lbound_test_3(a, dim, res)
   real, dimension(2:10, 3:*) :: a
   integer(8):: dim, res
-! CHECK:  %[[VAL_0:.*]] = arith.constant -1 : index
+! CHECK:  %[[VAL_0:.*]] = fir.assumed_size_extent : index
 ! CHECK:  %[[VAL_1:.*]] = fir.load %arg1 : !fir.ref<i64>
 ! CHECK:  %[[VAL_2:.*]] = fir.shape_shift %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_0]] : (index, index, index, index) -> !fir.shapeshift<2>
 ! CHECK:         %[[VAL_3:.*]] = fir.embox %arg0(%[[VAL_2]]) : (!fir.ref<!fir.array<9x?xf32>>, !fir.shapeshift<2>) -> !fir.box<!fir.array<9x?xf32>>
diff --git a/flang/test/Lower/Intrinsics/ubound.f90 b/flang/test/Lower/Intrinsics/ubound.f90
index dae21ac..bc8cff8 100644
--- a/flang/test/Lower/Intrinsics/ubound.f90
+++ b/flang/test/Lower/Intrinsics/ubound.f90
@@ -48,7 +48,7 @@ end subroutine
 subroutine ubound_test_3(a, dim, res)
   real, dimension(10, 20, *) :: a
   integer(8):: dim, res
-! CHECK:         %[[VAL_0:.*]] = arith.constant -1 : index
+! CHECK:         %[[VAL_0:.*]] = fir.assumed_size_extent : index
 ! CHECK:         %[[VAL_1:.*]] = fir.shape %{{.*}}, %{{.*}}, %[[VAL_0]] : (index, index, index) -> !fir.shape<3>
 ! CHECK:         %[[VAL_2:.*]] = fir.embox %{{.*}}(%[[VAL_1]]) : (!fir.ref<!fir.array<10x20x?xf32>>, !fir.shape<3>) -> !fir.box<!fir.array<10x20x?xf32>>
 ! CHECK:         %[[VAL_3:.*]] = fir.load %{{.*}} : !fir.ref<i64>
diff --git a/flang/test/Lower/array-expression-assumed-size.f90 b/flang/test/Lower/array-expression-assumed-size.f90
index 2fbf315..a498148 100644
--- a/flang/test/Lower/array-expression-assumed-size.f90
+++ b/flang/test/Lower/array-expression-assumed-size.f90
@@ -19,7 +19,7 @@ end subroutine assumed_size_forall_test
 ! CHECK:         %[[VAL_1A:.*]] = fir.convert %c10{{.*}} : (i64) -> index 
 ! CHECK:         %[[VAL_1B:.*]] = arith.cmpi sgt, %[[VAL_1A]], %c0{{.*}} : index 
 ! CHECK:         %[[VAL_1:.*]] = arith.select %[[VAL_1B]], %[[VAL_1A]], %c0{{.*}} : index
-! CHECK:         %[[VAL_2:.*]] = arith.constant -1 : index
+! CHECK:         %[[VAL_2:.*]] = fir.assumed_size_extent : index
 ! CHECK:         %[[VAL_3:.*]] = arith.constant 1 : index
 ! CHECK:         %[[VAL_4:.*]] = arith.constant 1 : i64
 ! CHECK:         %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index
@@ -82,7 +82,7 @@ end subroutine assumed_size_forall_test
 ! CHECK:         %[[VAL_2A:.*]] = fir.convert %c10{{.*}} : (i64) -> index 
 ! CHECK:         %[[VAL_2B:.*]] = arith.cmpi sgt, %[[VAL_2A]], %c0{{.*}} : index 
 ! CHECK:         %[[VAL_2:.*]] = arith.select %[[VAL_2B]], %[[VAL_2A]], %c0{{.*}} : index
-! CHECK:         %[[VAL_3:.*]] = arith.constant -1 : index
+! CHECK:         %[[VAL_3:.*]] = fir.assumed_size_extent : index
 ! CHECK:         %[[VAL_4:.*]] = arith.constant 2 : i32
 ! CHECK:         %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> index
 ! CHECK:         %[[VAL_6:.*]] = arith.constant 6 : i32
@@ -149,7 +149,7 @@ end subroutine assumed_size_forall_test
 ! PostOpt-DAG:         %[[VAL_4:.*]] = arith.constant 0 : index
 ! PostOpt-DAG:         %[[VAL_5:.*]] = arith.constant 3 : index
 ! PostOpt-DAG:         %[[VAL_6:.*]] = arith.constant 4 : index
-! PostOpt-DAG:         %[[VAL_7:.*]] = arith.constant -1 : index
+! PostOpt-DAG:         %[[VAL_7:.*]] = fir.assumed_size_extent : index
 ! PostOpt:         %[[VAL_8:.*]] = fir.shape %[[VAL_1]], %[[VAL_7]] : (index, index) -> !fir.shape<2>
 ! PostOpt:         %[[VAL_9:.*]] = fir.slice %[[VAL_2]], %[[VAL_1]], %[[VAL_2]], %[[VAL_2]], %[[VAL_3]], %[[VAL_2]] : (index, index, index, index, index, index) -> !fir.slice<2>
 ! PostOpt:         %[[VAL_10:.*]] = fir.allocmem !fir.array<10x?xi32>, %[[VAL_3]]
@@ -227,8 +227,8 @@ end subroutine assumed_size_forall_test
 ! PostOpt-DAG:         %[[VAL_4:.*]] = arith.constant 1 : index
 ! PostOpt-DAG:         %[[VAL_5:.*]] = arith.constant 0 : index
 ! PostOpt-DAG:         %[[VAL_6:.*]] = arith.constant 5 : index
-! PostOpt-DAG:         %[[VAL_8:.*]] = arith.constant -1 : index
 ! PostOpt:         %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"}
+! PostOpt:         %[[VAL_8:.*]] = fir.assumed_size_extent : index
 ! PostOpt:         %[[VAL_9:.*]] = fir.shape %[[VAL_2]], %[[VAL_8]] : (index, index) -> !fir.shape<2>
 ! PostOpt:         %[[VAL_10:.*]] = fir.allocmem !fir.array<10x?xi32>, %[[VAL_4]]
 ! PostOpt:         br ^bb1(%[[VAL_5]], %[[VAL_4]] : index, index)
diff --git a/flang/test/Lower/entry-statement.f90 b/flang/test/Lower/entry-statement.f90
index 83d2d32..f1e535a 100644
--- a/flang/test/Lower/entry-statement.f90
+++ b/flang/test/Lower/entry-statement.f90
@@ -491,7 +491,7 @@ end subroutine
 ! CHECK-LABEL:   func.func @_QPentry_with_assumed_size(
 ! CHECK-SAME:                                          %[[VAL_0:.*]]: !fir.ref<!fir.array<?xf32>> {fir.bindc_name = "x"}) {
 ! CHECK:           %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
-! CHECK:           %[[VAL_2:.*]] = arith.constant -1 : index
+! CHECK:           %[[VAL_2:.*]] = fir.assumed_size_extent : index
 ! CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFassumed_sizeEx"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
 ! CHECK:           cf.br ^bb1
diff --git a/flang/test/Lower/forall-polymorphic.f90 b/flang/test/Lower/forall-polymorphic.f90
new file mode 100644
index 0000000..2b7a51f
--- /dev/null
+++ b/flang/test/Lower/forall-polymorphic.f90
@@ -0,0 +1,89 @@
+! Test lowering of FORALL polymorphic pointer assignment
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+
+!! Test FORALL pointer assignment when the LHS pointer component is polymorphic (CLASS) and the RHS target is not (TYPE)
+! CHECK-LABEL: func.func @_QPforallpolymorphic(
+  subroutine forallPolymorphic()
+  TYPE :: DT
+    CLASS(DT), POINTER    :: Ptr(:) => NULL()
+  END TYPE
+
+  TYPE, EXTENDS(DT) :: DT1
+  END TYPE
+
+  TYPE(DT1), TARGET  :: Tar1(10)
+  CLASS(DT), POINTER :: T(:)
+  integer :: I
+
+  FORALL (I=1:10)
+    T(I)%Ptr => Tar1
+  END FORALL
+
+! CHECK: %[[V_11:[0-9]+]] = fir.alloca !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>> {bindc_name = "t", uniq_name = "_QFforallpolymorphicEt"}
+! CHECK: %[[V_15:[0-9]+]] = fir.declare %[[V_11]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFforallpolymorphicEt"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>) -> !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>
+! CHECK: %[[V_16:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>> {bindc_name = "tar1", fir.target, uniq_name = "_QFforallpolymorphicEtar1"}
+! CHECK: %[[V_17:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1>
+! CHECK: %[[V_18:[0-9]+]] = fir.declare %[[V_16]](%[[V_17]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFforallpolymorphicEtar1"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>
+! CHECK: %[[V_19:[0-9]+]] = fir.embox %[[V_18]](%[[V_17]]) : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>, !fir.shape<1>) -> !fir.box<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>
+! CHECK: %[[V_34:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index
+! CHECK: %[[V_35:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index
+! CHECK: fir.do_loop %arg0 = %[[V_34]] to %[[V_35]] step %c1
+! CHECK: {
+! CHECK: %[[V_36:[0-9]+]] = fir.convert %arg0 : (index) -> i32
+! CHECK: %[[V_37:[0-9]+]] = fir.load %[[V_15]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>
+! CHECK: %[[V_38:[0-9]+]] = fir.convert %[[V_36]] : (i32) -> i64
+! CHECK: %[[C0:.*]] = arith.constant 0 : index
+! CHECK: %[[V_39:[0-9]+]]:3 = fir.box_dims %[[V_37]], %[[C0]] : (!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>, index) -> (index, index, index)
+! CHECK: %[[V_40:[0-9]+]] = fir.shift %[[V_39]]#0 : (index) -> !fir.shift<1>
+! CHECK: %[[V_41:[0-9]+]] = fir.array_coor %[[V_37]](%[[V_40]]) %[[V_38]] : (!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>, !fir.shift<1>, i64) -> !fir.ref<!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>
+! CHECK: %[[V_42:[0-9]+]] = fir.embox %[[V_41]] source_box %[[V_37]] : (!fir.ref<!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>, !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>) -> !fir.class<!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>
+! CHECK: %[[V_43:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>
+! CHECK: %[[V_44:[0-9]+]] = fir.coordinate_of %[[V_42]], ptr : (!fir.class<!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>) -> !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>
+! CHECK: %[[V_45:[0-9]+]] = fir.embox %[[V_18]](%[[V_17]]) : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>>
+! CHECK: %[[V_46:[0-9]+]] = fir.convert %[[V_45]] : (!fir.box<!fir.ptr<!fir.array<10x!fir.type<_QFforallpolymorphicTdt1{dt:!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>}>>>>) -> !fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>
+! CHECK: fir.store %[[V_46]] to %[[V_44]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt{ptr:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphicTdt>>>>}>>>>>
+! CHECK: }
+
+  end subroutine forallPolymorphic
+
+!! Test FORALL pointer assignment when the LHS pointer component is not polymorphic (TYPE) and the RHS target is polymorphic (CLASS)
+! CHECK-LABEL: func.func @_QPforallpolymorphic2(
+! CHECK-SAME: %arg0: !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>> {fir.bindc_name = "tar1", fir.target}) {
+  subroutine forallPolymorphic2(Tar1)
+  TYPE :: DT
+    TYPE(DT), POINTER    :: Ptr(:) => NULL()
+  END TYPE
+
+  TYPE, EXTENDS(DT) :: DT1
+  END TYPE
+
+  CLASS(DT), ALLOCATABLE, TARGET  :: Tar1(:)
+  TYPE(DT) :: T(10)
+  integer :: I
+
+  FORALL (I=1:10)
+    T(I)%Ptr => Tar1
+  END FORALL
+
+! CHECK: %[[V_11:[0-9]+]] = fir.alloca !fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>> {bindc_name = "t", uniq_name = "_QFforallpolymorphic2Et"}
+! CHECK: %[[V_12:[0-9]+]] = fir.shape %c10 : (index) -> !fir.shape<1>
+! CHECK: %[[V_13:[0-9]+]] = fir.declare %[[V_11]](%[[V_12]]) {uniq_name = "_QFforallpolymorphic2Et"} : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>, !fir.shape<1>) -> !fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>
+! CHECK: %[[V_18:[0-9]+]] = fir.declare %arg0 dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable, target>, uniq_name = "_QFforallpolymorphic2Etar1"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>, !fir.dscope) -> !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>
+! CHECK: %[[V_30:[0-9]+]] = fir.convert %c1_i32 : (i32) -> index
+! CHECK: %[[V_31:[0-9]+]] = fir.convert %c10_i32 : (i32) -> index
+! CHECK: fir.do_loop %arg1 = %[[V_30]] to %[[V_31]] step %c1
+! CHECK: {
+! CHECK: %[[V_32:[0-9]+]] = fir.convert %arg1 : (index) -> i32
+! CHECK: %[[V_33:[0-9]+]] = fir.convert %[[V_32]] : (i32) -> i64
+! CHECK: %[[V_34:[0-9]+]] = fir.array_coor %[[V_13]](%[[V_12]]) %[[V_33]] : (!fir.ref<!fir.array<10x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>, !fir.shape<1>, i64) -> !fir.ref<!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>
+! CHECK: %[[V_35:[0-9]+]] = fir.field_index ptr, !fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>
+! CHECK: %[[V_36:[0-9]+]] = fir.coordinate_of %[[V_34]], ptr : (!fir.ref<!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>
+! CHECK: %[[V_37:[0-9]+]] = fir.load %[[V_18]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>
+! CHECK: %[[V_38:[0-9]+]]:3 = fir.box_dims %[[V_37]], %c0 : (!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>, index) -> (index, index, index)
+! CHECK: %[[V_39:[0-9]+]] = fir.shift %[[V_38]]#0 : (index) -> !fir.shift<1>
+! CHECK: %[[V_40:[0-9]+]] = fir.rebox %[[V_37]](%[[V_39]]) : (!fir.class<!fir.heap<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>, !fir.shift<1>) -> !fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>
+! CHECK: fir.store %[[V_40]] to %[[V_36]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt{ptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QFforallpolymorphic2Tdt>>>>}>>>>>
+! CHECK: }
+
+  end subroutine forallPolymorphic2
+
diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90
index 1d99811..229fd4d 100644
--- a/flang/test/Semantics/OpenMP/allocate01.f90
+++ b/flang/test/Semantics/OpenMP/allocate01.f90
@@ -15,7 +15,7 @@ use omp_lib
     integer :: a, b
     real, dimension (:,:), allocatable :: darray
 
-    !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
+    !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
     !$omp allocate(y)
         print *, a
 
diff --git a/flang/test/Semantics/OpenMP/allocate04.f90 b/flang/test/Semantics/OpenMP/allocate04.f90
index bbd74eb..5fd75ba 100644
--- a/flang/test/Semantics/OpenMP/allocate04.f90
+++ b/flang/test/Semantics/OpenMP/allocate04.f90
@@ -14,16 +14,19 @@ use iso_c_binding
   type(c_ptr), pointer :: p
   integer :: x, y, z
 
-  associate (a => x)
-  !$omp allocate(x) allocator(omp_default_mem_alloc)
-
   !ERROR: PRIVATE clause is not allowed on the ALLOCATE directive
   !$omp allocate(y) private(y)
-  !ERROR: List item 'z' in ALLOCATE directive must not be a dummy argument
-  !$omp allocate(z)
-  !ERROR: List item 'p' in ALLOCATE directive must not have POINTER attribute
+  !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute
   !$omp allocate(p)
-  !ERROR: List item 'a' in ALLOCATE directive must not be an associate name
+
+  associate (a => x)
+  block
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
+  !$omp allocate(x) allocator(omp_default_mem_alloc)
+
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
+  !ERROR: A list item in a declarative ALLOCATE cannot be an associate name
   !$omp allocate(a)
+  end block
   end associate
 end subroutine allocate
diff --git a/flang/test/Semantics/OpenMP/allocate05.f90 b/flang/test/Semantics/OpenMP/allocate05.f90
index a787e8b..b5f7864 100644
--- a/flang/test/Semantics/OpenMP/allocate05.f90
+++ b/flang/test/Semantics/OpenMP/allocate05.f90
@@ -18,7 +18,7 @@ use omp_lib
   !$omp end target
 
   !$omp target
-      !ERROR: ALLOCATE directives that appear in a TARGET region must specify an allocator clause
+      !ERROR: An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified
       !$omp allocate
           allocate ( darray(a, b) )
   !$omp end target
diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90
index e14134c..9b57322 100644
--- a/flang/test/Semantics/OpenMP/allocate06.f90
+++ b/flang/test/Semantics/OpenMP/allocate06.f90
@@ -11,7 +11,7 @@ use omp_lib
   integer :: a, b, x
   real, dimension (:,:), allocatable :: darray
 
-  !ERROR: List items specified in the ALLOCATE directive must not have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement
+  !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute
   !$omp allocate(darray) allocator(omp_default_mem_alloc)
 
   !$omp allocate(darray) allocator(omp_default_mem_alloc)
diff --git a/flang/test/Semantics/OpenMP/allocate08.f90 b/flang/test/Semantics/OpenMP/allocate08.f90
index 5bfa918..f4f11e2 100644
--- a/flang/test/Semantics/OpenMP/allocate08.f90
+++ b/flang/test/Semantics/OpenMP/allocate08.f90
@@ -3,14 +3,15 @@
 ! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
 ! OpenMP Version 5.0
 ! 2.11.3 allocate Directive
-! If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a
-! module, then only predefined memory allocator parameters can be used in the allocator clause
+! If list items within the ALLOCATE directive have the SAVE attribute, are a
+! common block name, or are declared in the scope of a module, then only
+! predefined memory allocator parameters can be used in the allocator clause
 
 module AllocateModule
   INTEGER :: z
 end module
 
-subroutine allocate()
+subroutine allocate(custom_allocator)
 use omp_lib
 use AllocateModule
   integer, SAVE :: x
@@ -18,30 +19,25 @@ use AllocateModule
   COMMON /CommonName/ y
 
   integer(kind=omp_allocator_handle_kind) :: custom_allocator
-  integer(kind=omp_memspace_handle_kind) :: memspace
-  type(omp_alloctrait), dimension(1) :: trait
-  memspace = omp_default_mem_space
-  trait(1)%key = fallback
-  trait(1)%value = default_mem_fb
-  custom_allocator = omp_init_allocator(memspace, 1, trait)
 
   !$omp allocate(x) allocator(omp_default_mem_alloc)
+  !ERROR: A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block
   !$omp allocate(y) allocator(omp_default_mem_alloc)
-  !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
   !$omp allocate(z) allocator(omp_default_mem_alloc)
 
+  !ERROR: If a list item is a named common block or has SAVE attribute, an ALLOCATOR clause must be present with a predefined allocator
   !$omp allocate(x)
+  !ERROR: A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block
   !$omp allocate(y)
-  !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
   !$omp allocate(z)
 
   !$omp allocate(w) allocator(custom_allocator)
 
-  !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
   !$omp allocate(x) allocator(custom_allocator)
-  !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
+  !ERROR: A variable that is part of a common block may not be specified as a list item in an ALLOCATE directive, except implicitly via the named common block
   !$omp allocate(y) allocator(custom_allocator)
-  !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
-  !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
+  !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears
   !$omp allocate(z) allocator(custom_allocator)
 end subroutine allocate
diff --git a/flang/test/Semantics/OpenMP/allocators04.f90 b/flang/test/Semantics/OpenMP/allocators04.f90
index c71c7ca..212e48f 100644
--- a/flang/test/Semantics/OpenMP/allocators04.f90
+++ b/flang/test/Semantics/OpenMP/allocators04.f90
@@ -22,12 +22,10 @@ subroutine allocate()
     trait(1)%value = default_mem_fb
     custom_allocator = omp_init_allocator(omp_default_mem_space, 1, trait)
 
-    !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
     !$omp allocators allocate(omp_default_mem_alloc: a)
         allocate(a)
 
     !ERROR: If list items within the ALLOCATORS directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
-    !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
     !$omp allocators allocate(custom_allocator: b)
         allocate(b)
 end subroutine
diff --git a/flang/test/Semantics/OpenMP/allocators06.f90 b/flang/test/Semantics/OpenMP/allocators06.f90
deleted file mode 100644
index 8e63512..0000000
--- a/flang/test/Semantics/OpenMP/allocators06.f90
+++ /dev/null
@@ -1,18 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=50
-! OpenMP Version 5.2
-! Inherited from 2.11.3 allocate directive
-! The allocate directive must appear in the same scope as the declarations of
-! each of its list items and must follow all such declarations.
-
-subroutine allocate()
-    use omp_lib
-    integer, allocatable :: a
-contains
-    subroutine test()
-        !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
-        !$omp allocators allocate(omp_default_mem_alloc: a)
-            allocate(a)
-    end subroutine
-end subroutine
diff --git a/flang/test/Semantics/OpenMP/declarative-directive02.f90 b/flang/test/Semantics/OpenMP/declarative-directive02.f90
index dcde963..04b8c3d 100644
--- a/flang/test/Semantics/OpenMP/declarative-directive02.f90
+++ b/flang/test/Semantics/OpenMP/declarative-directive02.f90
@@ -9,7 +9,7 @@ subroutine test_decl
   implicit none
   save :: x1, y1
   !$omp threadprivate(x1)
-  !$omp allocate(y1)
+  !$omp allocate(y1) allocator(0)
   integer :: x1, y1
 
   ! OMPv5.2 7.7 declare-simd
@@ -33,12 +33,12 @@ end subroutine
 subroutine test_decl2
   save x1, y1
   !$omp threadprivate(x1)
-  !$omp allocate(y1)
+  !$omp allocate(y1) allocator(0)
   integer :: x1, y1
 
   ! implicit decl
   !$omp threadprivate(x2)
-  !$omp allocate(y2)
+  !$omp allocate(y2) allocator(0)
   save x2, y2
 end subroutine
 
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index 9178b0a..df6568d 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -36,6 +36,12 @@ module m
     if (i .le. N) a(i) = m(i)
   end subroutine
 
+  attributes(device) function devfct(r1, r2) result(res)
+    real(4), intent(in) :: r1(3), r2(3)
+    real(4) :: res(3)
+    res = r1 - r2 ! No diagnostic expected: assigning to the array function result in a device function must not be flagged
+  end function
+
   attributes(global) subroutine hostparameter(a)
     integer :: a(*)
     i = threadIdx%x
diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp
index 32b0a1d..67d07ee 100644
--- a/flang/tools/fir-opt/fir-opt.cpp
+++ b/flang/tools/fir-opt/fir-opt.cpp
@@ -50,6 +50,7 @@ int main(int argc, char **argv) {
 #endif
   DialectRegistry registry;
   fir::support::registerDialects(registry);
+  registry.insert<mlir::memref::MemRefDialect>();
   fir::support::addFIRExtensions(registry);
   return failed(MlirOptMain(argc, argv, "FIR modular optimizer driver\n",
       registry));