aboutsummaryrefslogtreecommitdiff
path: root/boehm-gc/include/cord.h
blob: 926089e86fbb18567aa48981aa68dc152f1e7b76 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
/* 
 * Copyright (c) 1993-1994 by Xerox Corporation.  All rights reserved.
 *
 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
 * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
 *
 * Permission is hereby granted to use or copy this program
 * for any purpose,  provided the above notices are retained on all copies.
 * Permission to modify the code and to distribute modified code is granted,
 * provided the above notices are retained, and a notice that the code was
 * modified is included with the above copyright notice.
 *
 * Author: Hans-J. Boehm (boehm@parc.xerox.com)
 */
/* Boehm, October 5, 1995 4:20 pm PDT */
 
/*
 * Cords are immutable character strings.  A number of operations
 * on long cords are much more efficient than their strings.h counterpart.
 * In particular, concatenation takes constant time independent of the length
 * of the arguments.  (Cords are represented as trees, with internal
 * nodes representing concatenation and leaves consisting of either C
 * strings or a functional description of the string.)
 *
 * The following are reasonable applications of cords.  They would perform
 * unacceptably if C strings were used:
 * - A compiler that produces assembly language output by repeatedly
 *   concatenating instructions onto a cord representing the output file.
 * - A text editor that converts the input file to a cord, and then
 *   performs editing operations by producing a new cord representing
 *   the file after echa character change (and keeping the old ones in an
 *   edit history)
 *
 * For optimal performance, cords should be built by
 * concatenating short sections.
 * This interface is designed for maximum compatibility with C strings.
 * ASCII NUL characters may be embedded in cords using CORD_from_fn.
 * This is handled correctly, but CORD_to_char_star will produce a string
 * with embedded NULs when given such a cord. 
 *
 * This interface is fairly big, largely for performance reasons.
 * The most basic constants and functions:
 *
 * CORD - the type of a cord;
 * CORD_EMPTY - empty cord;
 * CORD_len(cord) - length of a cord;
 * CORD_cat(cord1,cord2) - concatenation of two cords;
 * CORD_substr(cord, start, len) - substring (or subcord);
 * CORD_pos i;  CORD_FOR(i, cord) {  ... CORD_pos_fetch(i) ... } -
 *    examine each character in a cord.  CORD_pos_fetch(i) is the char.
 * CORD_fetch(int i) - Retrieve i'th character (slowly).
 * CORD_cmp(cord1, cord2) - compare two cords.
 * CORD_from_file(FILE * f) - turn a read-only file into a cord.
 * CORD_to_char_star(cord) - convert to C string.
 *   (Non-NULL C constant strings are cords.)
 * CORD_printf (etc.) - cord version of printf. Use %r for cords.
 */
# ifndef CORD_H

# define CORD_H
# include <stddef.h>
# include <stdio.h>
/* Cords have type const char *.  This is cheating quite a bit, and not	*/
/* 100% portable.  But it means that nonempty character string		*/
/* constants may be used as cords directly, provided the string is	*/
/* never modified in place.  The empty cord is represented by, and	*/
/* can be written as, 0.						*/

typedef const char * CORD;

/* An empty cord is always represented as nil 	*/
# define CORD_EMPTY 0

/* Is a nonempty cord represented as a C string? */
#define CORD_IS_STRING(s) (*(s) != '\0')

/* Concatenate two cords.  If the arguments are C strings, they may 	*/
/* not be subsequently altered.						*/
CORD CORD_cat(CORD x, CORD y);

/* Concatenate a cord and a C string with known length.  Except for the	*/
/* empty string case, this is a special case of CORD_cat.  Since the	*/
/* length is known, it can be faster.					*/
/* The string y is shared with the resulting CORD.  Hence it should	*/
/* not be altered by the caller.					*/
CORD CORD_cat_char_star(CORD x, const char * y, size_t leny);

/* Compute the length of a cord */
size_t CORD_len(CORD x);

/* Cords may be represented by functions defining the ith character */
typedef char (* CORD_fn)(size_t i, void * client_data);

/* Turn a functional description into a cord. 	*/
CORD CORD_from_fn(CORD_fn fn, void * client_data, size_t len);

/* Return the substring (subcord really) of x with length at most n,	*/
/* starting at position i.  (The initial character has position 0.)	*/
CORD CORD_substr(CORD x, size_t i, size_t n);

/* Return the argument, but rebalanced to allow more efficient   	*/
/* character retrieval, substring operations, and comparisons.		*/
/* This is useful only for cords that were built using repeated 	*/
/* concatenation.  Guarantees log time access to the result, unless	*/
/* x was obtained through a large number of repeated substring ops	*/
/* or the embedded functional descriptions take longer to evaluate.	*/
/* May reallocate significant parts of the cord.  The argument is not	*/
/* modified; only the result is balanced.				*/
CORD CORD_balance(CORD x);

/* The following traverse a cord by applying a function to each 	*/
/* character.  This is occasionally appropriate, especially where	*/
/* speed is crucial.  But, since C doesn't have nested functions,	*/
/* clients of this sort of traversal are clumsy to write.  Consider	*/
/* the functions that operate on cord positions instead.		*/

/* Function to iteratively apply to individual characters in cord.	*/
typedef int (* CORD_iter_fn)(char c, void * client_data);

/* Function to apply to substrings of a cord.  Each substring is a 	*/
/* a C character string, not a general cord.				*/
typedef int (* CORD_batched_iter_fn)(const char * s, void * client_data);
# define CORD_NO_FN ((CORD_batched_iter_fn)0)

/* Apply f1 to each character in the cord, in ascending order,		*/
/* starting at position i. If						*/
/* f2 is not CORD_NO_FN, then multiple calls to f1 may be replaced by	*/
/* a single call to f2.  The parameter f2 is provided only to allow	*/
/* some optimization by the client.  This terminates when the right	*/
/* end of this string is reached, or when f1 or f2 return != 0.  In the	*/
/* latter case CORD_iter returns != 0.  Otherwise it returns 0.		*/
/* The specified value of i must be < CORD_len(x).			*/
int CORD_iter5(CORD x, size_t i, CORD_iter_fn f1,
	       CORD_batched_iter_fn f2, void * client_data);

/* A simpler version that starts at 0, and without f2:	*/
int CORD_iter(CORD x, CORD_iter_fn f1, void * client_data);
# define CORD_iter(x, f1, cd) CORD_iter5(x, 0, f1, CORD_NO_FN, cd)

/* Similar to CORD_iter5, but end-to-beginning.	No provisions for	*/
/* CORD_batched_iter_fn.						*/
int CORD_riter4(CORD x, size_t i, CORD_iter_fn f1, void * client_data);

/* A simpler version that starts at the end:	*/
int CORD_riter(CORD x, CORD_iter_fn f1, void * client_data);

/* Functions that operate on cord positions.  The easy way to traverse	*/
/* cords.  A cord position is logically a pair consisting of a cord	*/
/* and an index into that cord.  But it is much faster to retrieve a	*/
/* charcter based on a position than on an index.  Unfortunately,	*/
/* positions are big (order of a few 100 bytes), so allocate them with	*/
/* caution.								*/
/* Things in cord_pos.h should be treated as opaque, except as		*/
/* described below.  Also note that					*/
/* CORD_pos_fetch, CORD_next and CORD_prev have both macro and function	*/
/* definitions.  The former may evaluate their argument more than once. */
# include "private/cord_pos.h"

/*
	Visible definitions from above:
	
	typedef <OPAQUE but fairly big> CORD_pos[1];
	
	* Extract the cord from a position:
	CORD CORD_pos_to_cord(CORD_pos p);
	
	* Extract the current index from a position:
	size_t CORD_pos_to_index(CORD_pos p);
	
	* Fetch the character located at the given position:
	char CORD_pos_fetch(CORD_pos p);
	
	* Initialize the position to refer to the given cord and index.
	* Note that this is the most expensive function on positions:
	void CORD_set_pos(CORD_pos p, CORD x, size_t i);
	
	* Advance the position to the next character.
	* P must be initialized and valid.
	* Invalidates p if past end:
	void CORD_next(CORD_pos p);
	
	* Move the position to the preceding character.
	* P must be initialized and valid.
	* Invalidates p if past beginning:
	void CORD_prev(CORD_pos p);
	
	* Is the position valid, i.e. inside the cord?
	int CORD_pos_valid(CORD_pos p);
*/
# define CORD_FOR(pos, cord) \
    for (CORD_set_pos(pos, cord, 0); CORD_pos_valid(pos); CORD_next(pos))

			
/* An out of memory handler to call.  May be supplied by client.	*/
/* Must not return.							*/
extern void (* CORD_oom_fn)(void);

/* Dump the representation of x to stdout in an implementation defined	*/
/* manner.  Intended for debugging only.				*/
void CORD_dump(CORD x);

/* The following could easily be implemented by the client.  They are	*/
/* provided in cordxtra.c for convenience.				*/

/* Concatenate a character to the end of a cord.	*/
CORD CORD_cat_char(CORD x, char c);

/* Concatenate n cords.	*/
CORD CORD_catn(int n, /* CORD */ ...);

/* Return the character in CORD_substr(x, i, 1)  	*/
char CORD_fetch(CORD x, size_t i);

/* Return < 0, 0, or > 0, depending on whether x < y, x = y, x > y	*/
int CORD_cmp(CORD x, CORD y);

/* A generalization that takes both starting positions for the 		*/
/* comparison, and a limit on the number of characters to be compared.	*/
int CORD_ncmp(CORD x, size_t x_start, CORD y, size_t y_start, size_t len);

/* Find the first occurrence of s in x at position start or later.	*/
/* Return the position of the first character of s in x, or		*/
/* CORD_NOT_FOUND if there is none.					*/
size_t CORD_str(CORD x, size_t start, CORD s);

/* Return a cord consisting of i copies of (possibly NUL) c.  Dangerous	*/
/* in conjunction with CORD_to_char_star.				*/
/* The resulting representation takes constant space, independent of i.	*/
CORD CORD_chars(char c, size_t i);
# define CORD_nul(i) CORD_chars('\0', (i))

/* Turn a file into cord.  The file must be seekable.  Its contents	*/
/* must remain constant.  The file may be accessed as an immediate	*/
/* result of this call and/or as a result of subsequent accesses to 	*/
/* the cord.  Short files are likely to be immediately read, but	*/
/* long files are likely to be read on demand, possibly relying on 	*/
/* stdio for buffering.							*/
/* We must have exclusive access to the descriptor f, i.e. we may	*/
/* read it at any time, and expect the file pointer to be		*/
/* where we left it.  Normally this should be invoked as		*/
/* CORD_from_file(fopen(...))						*/
/* CORD_from_file arranges to close the file descriptor when it is no	*/
/* longer needed (e.g. when the result becomes inaccessible).		*/ 
/* The file f must be such that ftell reflects the actual character	*/
/* position in the file, i.e. the number of characters that can be 	*/
/* or were read with fread.  On UNIX systems this is always true.  On	*/
/* MS Windows systems, f must be opened in binary mode.			*/
CORD CORD_from_file(FILE * f);

/* Equivalent to the above, except that the entire file will be read	*/
/* and the file pointer will be closed immediately.			*/
/* The binary mode restriction from above does not apply.		*/
CORD CORD_from_file_eager(FILE * f);

/* Equivalent to the above, except that the file will be read on demand.*/
/* The binary mode restriction applies.					*/
CORD CORD_from_file_lazy(FILE * f);

/* Turn a cord into a C string.	The result shares no structure with	*/
/* x, and is thus modifiable.						*/
char * CORD_to_char_star(CORD x);

/* Turn a C string into a CORD.  The C string is copied, and so may	*/
/* subsequently be modified.						*/
CORD CORD_from_char_star(const char *s);

/* Identical to the above, but the result may share structure with	*/
/* the argument and is thus not modifiable.				*/
const char * CORD_to_const_char_star(CORD x); 

/* Write a cord to a file, starting at the current position.  No	*/
/* trailing NULs are newlines are added.				*/
/* Returns EOF if a write error occurs, 1 otherwise.			*/
int CORD_put(CORD x, FILE * f);

/* "Not found" result for the following two functions.			*/
# define CORD_NOT_FOUND ((size_t)(-1))

/* A vague analog of strchr.  Returns the position (an integer, not	*/
/* a pointer) of the first occurrence of (char) c inside x at position 	*/
/* i or later. The value i must be < CORD_len(x).			*/
size_t CORD_chr(CORD x, size_t i, int c);

/* A vague analog of strrchr.  Returns index of the last occurrence	*/
/* of (char) c inside x at position i or earlier. The value i		*/
/* must be < CORD_len(x).						*/
size_t CORD_rchr(CORD x, size_t i, int c);


/* The following are also not primitive, but are implemented in 	*/
/* cordprnt.c.  They provide functionality similar to the ANSI C	*/
/* functions with corresponding names, but with the following		*/
/* additions and changes:						*/
/* 1. A %r conversion specification specifies a CORD argument.  Field	*/
/*    width, precision, etc. have the same semantics as for %s.		*/
/*    (Note that %c,%C, and %S were already taken.)			*/
/* 2. The format string is represented as a CORD.		        */
/* 3. CORD_sprintf and CORD_vsprintf assign the result through the 1st	*/ 	/*    argument.	Unlike their ANSI C versions, there is no need to guess	*/
/*    the correct buffer size.						*/
/* 4. Most of the conversions are implement through the native 		*/
/*    vsprintf.  Hence they are usually no faster, and 			*/
/*    idiosyncracies of the native printf are preserved.  However,	*/
/*    CORD arguments to CORD_sprintf and CORD_vsprintf are NOT copied;	*/
/*    the result shares the original structure.  This may make them	*/
/*    very efficient in some unusual applications.			*/
/*    The format string is copied.					*/
/* All functions return the number of characters generated or -1 on	*/
/* error.  This complies with the ANSI standard, but is inconsistent	*/
/* with some older implementations of sprintf.				*/

/* The implementation of these is probably less portable than the rest	*/
/* of this package.							*/

#ifndef CORD_NO_IO

#include <stdarg.h>

int CORD_sprintf(CORD * out, CORD format, ...);
int CORD_vsprintf(CORD * out, CORD format, va_list args);
int CORD_fprintf(FILE * f, CORD format, ...);
int CORD_vfprintf(FILE * f, CORD format, va_list args);
int CORD_printf(CORD format, ...);
int CORD_vprintf(CORD format, va_list args);

#endif /* CORD_NO_IO */

# endif /* CORD_H */