/**
 * Contains various string related functions.
 *
 * Copyright: Copyright (C) 1999-2025 by The D Language Foundation, All Rights Reserved
 * Authors:   Walter Bright, https://www.digitalmars.com
 * License:   $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:    $(LINK2 https://github.com/dlang/dmd/blob/master/compiler/src/dmd/root/string.d, root/_string.d)
 * Documentation:  https://dlang.org/phobos/dmd_root_string.html
 * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/compiler/src/dmd/root/string.d
 */
module dmd.root.string;

import core.stdc.string;
import dmd.root.rmem;

/**
 * Slices a `\0`-terminated C-string, excluding the terminator.
 *
 * Params:
 *      s = C string to slice; may be `null`
 * Returns:
 *      a slice over the characters of `s` up to (not including) the first `\0`,
 *      or `null` when `s` is `null`
 */
inout(char)[] toDString (inout(char)* s) pure nothrow @nogc
{
    import core.stdc.string : strlen;
    return s ? s[0 .. strlen(s)] : null;
}

/// Plain holder whose `expand` member unpacks into separate variadic arguments
private struct FTuple(T...)
{
    T expand;
}

/**
 * Packs a D string for `printf`-style functions using the format string `%.*s`.
 *
 * Params:
 *      str = string to pass; it does not need to be `\0`-terminated
 * Returns:
 *      a (length, ptr) tuple; pass `.expand` as the arguments matching `%.*s`.
 *      The length is clamped to `int.max`.
 */
auto fTuple(const(char)[] str)
{
    // `%.*s` takes its precision as an `int`. A plain `cast(int)` wraps for
    // lengths above `int.max`, and a negative precision makes `printf` behave
    // as if no precision was given, i.e. read until a `\0` beyond the slice.
    const int len = str.length > int.max ? int.max : cast(int) str.length;
    return FTuple!(int, const(char)*)(len, str.ptr);
}

///
unittest
{
    import core.stdc.stdio: snprintf;
    char[6] buf = '.';
    const(char)[] str = "cutoff"[0..4];
    snprintf(buf.ptr, buf.length, "%.*s", str.fTuple.expand);
    assert(buf[] == "cuto\0.");
}

unittest
{
    // A length that does not fit in an `int` is clamped instead of wrapping.
    // The slice is never dereferenced, only its length is inspected.
    const(char)* p = "x".ptr;
    const(char)[] huge = p[0 .. cast(size_t) int.max + 1];
    assert(huge.fTuple.expand[0] == int.max);
    assert(huge.fTuple.expand[1] is p);
}

/**
Compare two slices for equality, in a case-insensitive way

Comparison is based on `char` and does not do decoding.
As a result, it's only really accurate for plain ASCII strings.

Params:
s1 = string to compare
s2 = string to compare

Returns: `true` if `s1 == s2` regardless of case
*/
extern(D) static bool iequals(const(char)[] s1, const(char)[] s2) pure nothrow @nogc
{
    import core.stdc.ctype : toupper;

    if (s1.length != s2.length)
        return false;

    foreach (idx, c1; s1)
    {
        // Since we did a length check, it is safe to bypass bounds checking
        const c2 = s2.ptr[idx];
        // Only pay for the `toupper` calls when the raw bytes differ
        if (c1 != c2)
            if (toupper(c1) != toupper(c2))
                return false;
    }
    return true;
}

/**
Copy the content of `src` into a C-string ('\0' terminated) then call `dg`

The intent of this function is to provide an allocation-less
way to call a C function using a D slice.
The function internally allocates a buffer if needed, but frees it on exit.

Note:
The argument to `dg` is `scope`. To keep the data around after `dg` exits,
one has to copy it.

Params:
src = Slice to use to call the C function
dg  = Delegate to call afterwards; it receives the copy of `src` including
      the trailing '\0'

Returns:
The return value of `dg`
*/
auto toCStringThen(alias dg)(const(char)[] src) nothrow
{
    import dmd.common.smallbuffer : SmallBuffer;

    // One extra element for the terminator
    const len = src.length + 1;
    char[512] small = void;
    auto sb = SmallBuffer!char(len, small[]);
    scope ptr = sb[];
    ptr[0 .. src.length] = src[];
    ptr[src.length] = '\0';
    return dg(ptr);
}

unittest
{
    assert("Hello world".toCStringThen!((v) => v == "Hello world\0"));
    assert("Hello world\0".toCStringThen!((v) => v == "Hello world\0\0"));
    assert(null.toCStringThen!((v) => v == "\0"));
}

/*********************************************
 * Convert a D string to a C string by allocating memory,
 * copying it, and adding a terminating 0.
 * Params:
 *      s = string to copy
 * Returns:
 *      copy of `s`; the slice has the same length as `s`, and the element
 *      right after it (`result.ptr[result.length]`) is the terminating 0
 */
char[] toCString(scope const(char)[] s) nothrow
{
    const length = s.length;
    char* p = cast(char*)mem.xmalloc_noscan(length + 1);
    // Slice copy rather than `memcpy`: `s.ptr` is `null` for an empty/null
    // slice, and this matches how `toCStringThen` fills its buffer.
    p[0 .. length] = s[];
    p[length] = 0;
    return p[0 .. length];
}

/**
 * Strips one leading line terminator of the given string.
* * The following are what the Unicode standard considers as line terminators: * * | Name | D Escape Sequence | Unicode Code Point | * |---------------------|-------------------|--------------------| * | Line feed | `\n` | `U+000A` | * | Line tabulation | `\v` | `U+000B` | * | Form feed | `\f` | `U+000C` | * | Carriage return | `\r` | `U+000D` | * | Next line | | `U+0085` | * | Line separator | | `U+2028` | * | Paragraph separator | | `U+2029` | * * This function will also strip `\r\n`. */ string stripLeadingLineTerminator(string str) pure nothrow @nogc @safe { enum nextLine = "\xC2\x85"; enum lineSeparator = "\xE2\x80\xA8"; enum paragraphSeparator = "\xE2\x80\xA9"; static assert(lineSeparator.length == paragraphSeparator.length); if (str.length == 0) return str; switch (str[0]) { case '\r': { if (str.length >= 2 && str[1] == '\n') return str[2 .. $]; goto case; } case '\v', '\f', '\n': return str[1 .. $]; case nextLine[0]: { if (str.length >= 2 && str[0 .. 2] == nextLine) return str[2 .. $]; return str; } case lineSeparator[0]: { if (str.length >= lineSeparator.length) { const prefix = str[0 .. lineSeparator.length]; if (prefix == lineSeparator || prefix == paragraphSeparator) return str[lineSeparator.length .. 
$]; } return str; } default: return str; } } unittest { assert("".stripLeadingLineTerminator == ""); assert("foo".stripLeadingLineTerminator == "foo"); assert("\xC2foo".stripLeadingLineTerminator == "\xC2foo"); assert("\xE2foo".stripLeadingLineTerminator == "\xE2foo"); assert("\nfoo".stripLeadingLineTerminator == "foo"); assert("\vfoo".stripLeadingLineTerminator == "foo"); assert("\ffoo".stripLeadingLineTerminator == "foo"); assert("\rfoo".stripLeadingLineTerminator == "foo"); assert("\u0085foo".stripLeadingLineTerminator == "foo"); assert("\u2028foo".stripLeadingLineTerminator == "foo"); assert("\u2029foo".stripLeadingLineTerminator == "foo"); assert("\n\rfoo".stripLeadingLineTerminator == "\rfoo"); assert("\r\nfoo".stripLeadingLineTerminator == "foo"); } /** * A string comparison functions that returns the same result as strcmp * * Note: Strings are compared based on their ASCII values, no UTF-8 decoding. * * Some C functions (e.g. `qsort`) require a `int` result for comparison. * See_Also: Druntime's `core.internal.string` */ int dstrcmp()( scope const char[] s1, scope const char[] s2 ) @trusted { immutable len = s1.length <= s2.length ? s1.length : s2.length; if (__ctfe) { foreach (const u; 0 .. len) { if (s1[u] != s2[u]) return s1[u] > s2[u] ? 1 : -1; } } else { import core.stdc.string : memcmp; const ret = memcmp( s1.ptr, s2.ptr, len ); if ( ret ) return ret; } return s1.length < s2.length ? -1 : (s1.length > s2.length); } // unittest { assert(dstrcmp("Fraise", "Fraise") == 0); assert(dstrcmp("Baguette", "Croissant") < 0); assert(dstrcmp("Croissant", "Baguette") > 0); static assert(dstrcmp("Baguette", "Croissant") < 0); // UTF-8 decoding for the CT variant assert(dstrcmp("안녕하세요!", "안녕하세요!") == 0); static assert(dstrcmp("안녕하세요!", "안녕하세요!") == 0); } /** * Infers the length `N` of a string literal and coerces its type to a static * array with length `N + 1`. Returns the string with a null character appended * to the end. 
* * Params: * literal = string literal * * Notes: * - LDC produces quite optimal code for short strings: * - https://d.godbolt.org/z/M69Z1g * - https://gist.github.com/PetarKirov/338e4ab9292b6b2b311a3070572a07fb (backup URL) */ char[N + 1] toStaticArray(size_t N)(scope const(char)[N] literal) { char[N+1] result = void; result[0..N] = literal[0..N]; result[N] = 0; return result; } /// @safe pure nothrow @nogc unittest { auto m = "123".toStaticArray; const c = "123".toStaticArray; immutable i = "123".toStaticArray; enum e = "123".toStaticArray; assert(m == "123\0"); assert(c == "123\0"); assert(i == "123\0"); static assert(e == "123\0"); const empty = "".toStaticArray; static assert(empty.length == 1); static assert(empty[0] == '\0'); } /** * Checks if C string `p` starts with `needle`. * Params: * p = the C string to check * needle = the string to look for * Returns: * `true` if `p` starts with `needle` */ @system pure nothrow @nogc bool startsWith(scope const(char)* p, scope const(char)[] needle) in { assert(p && needle.ptr); } do { foreach (const c; needle) { assert(c); if (c != *p) return false; ++p; } return true; } ///ditto nothrow @nogc pure @safe bool startsWith(scope const(char)[] str, scope const(char)[] prefix) { if (str.length < prefix.length) return false; return str[0 .. prefix.length] == prefix; } /// @system pure nothrow @nogc unittest { const buf = "123".toStaticArray; const ptr = &buf[0]; assert(ptr.startsWith("")); assert(ptr.startsWith("1")); assert(ptr.startsWith("12")); assert(ptr.startsWith("123")); assert(!ptr.startsWith("1234")); } /********************************** * Take `text` and turn it into an InputRange that emits * slices into `text` for each line. 
* Params: * text = array of characters * Returns: * InputRange accessing `text` as a sequence of lines * Reference: * `std.string.splitLines()` */ auto splitLines(const char[] text) { struct Range { @safe: @nogc: nothrow: pure: private: const char[] text; size_t index; // index of start of line size_t eolIndex; // index of end of line before newline characters size_t nextIndex; // index past end of line public this(const char[] text) { this.text = text; this.index = 0; this.eolIndex = 0; this.nextIndex = 0; } public bool empty() { advance(); return index >= text.length; } public void popFront() { advance(); index = nextIndex; } public const(char)[] front() { advance(); if (index > eolIndex || index >= text.length) return ""; return text[index .. eolIndex]; } private void advance() { if (index != nextIndex) // if already advanced return; for (size_t i = index; i < text.length; ++i) { switch (text[i]) { case '\v', '\f', '\n': eolIndex = i; nextIndex = i + 1; return; case '\r': if (i + 1 < text.length && text[i + 1] == '\n') // decode "\r\n" { eolIndex = i; nextIndex = i + 2; return; } eolIndex = i; nextIndex = i + 1; return; /* Manually decode: * NEL is C2 85 */ case 0xC2: if (i + 1 < text.length && text[i + 1] == 0x85) { eolIndex = i; nextIndex = i + 2; return; } break; /* Manually decode: * lineSep is E2 80 A8 * paraSep is E2 80 A9 */ case 0xE2: if (i + 2 < text.length && text[i + 1] == 0x80 && (text[i + 2] == 0xA8 || text[i + 2] == 0xA9) ) { eolIndex = i; nextIndex = i + 3; return; } break; default: break; } } // No newline found; set indices to the end of the text eolIndex = text.length; nextIndex = text.length; } } return Range(text); } private struct FindSplit { @nogc nothrow pure @safe: const(char)[][3] elem; ref const(char)[] opIndex(size_t i) scope return { return elem[i]; } bool opCast() const scope { return elem[1].length > 0; } } /** Find a substring in a string and split the string into before and after parts. 
Params:
  str = string to look into
  needle = substring to find in str (must not be empty)

Returns:
  a `FindSplit` object that casts to `true` iff `needle` was found inside `str`.
  In that case, `split[1]` is the needle, and `split[0]`/`split[2]` are before/after the needle.
*/
FindSplit findSplit(return scope const(char)[] str, scope const(char)[] needle) @safe
{
    const width = needle.length;

    if (str.length >= width)
    {
        // Last position at which `needle` still fits into `str`
        const lastStart = str.length - width;

        for (size_t start = 0; start <= lastStart; ++start)
        {
            const end = start + width;
            if (str[start .. end] != needle)
                continue;

            return FindSplit([ str[0 .. start], str[start .. end], str[end .. $] ]);
        }
    }

    // Not found: everything goes into the "before" part
    return FindSplit([str, null, null]);
}

unittest
{
    auto s = findSplit("a b c", "c");
    assert(s[0] == "a b ");
    assert(s[1] == "c");
    assert(s[2] == "");
    auto s1 = findSplit("a b c", "b");
    assert(s1[0] == "a ");
    assert(s1[1] == "b");
    assert(s1[2] == " c");
    assert(!findSplit("a b c", "d"));
    assert(!findSplit("", "d"));
}

/**
Find a string in between two substrings

Params:
  str = string to look into
  l = substring to find on the left
  r = substring to find on the right

Returns:
  substring of `str` in between `l` and `r`, or `null` when `l` is not found
  in `str` or `r` is not found after it
*/
const(char)[] findBetween(const(char)[] str, const(char)[] l, const(char)[] r) @safe
{
    auto left = str.findSplit(l);
    if (!left)
        return null;

    // Only the text after the left marker is searched for the right one
    auto right = left[2].findSplit(r);
    if (!right)
        return null;

    return right[0];
}

unittest
{
    assert(findBetween("a b c", "a ", " c") == "b");
    assert(findBetween("a b c", "a ", " d") == null);
}