00001 /* Part of the culibs project, <http://www.eideticdew.org/culibs/>. 00002 * Copyright (C) 2010 Petter Urkedal <paurkedal@eideticdew.org> 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU General Public License as published by 00006 * the Free Software Foundation, either version 3 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef CUTEXT_SOURCE_H 00019 #define CUTEXT_SOURCE_H 00020 00021 #include <cutext/encoding.h> 00022 #include <cu/box.h> 00023 00024 CU_BEGIN_DECLARATIONS 00025 /** \defgroup cutext_source_h cutext/source.h: Text Source with Optional Lookahead 00026 ** @{ \ingroup cutext_mod 00027 ** 00028 ** This provides the API and some implementations of sources with optional 00029 ** lookahead, intended for scanning text. These sources form the initial 00030 ** source and stack of conversions used by the more specialized \ref 00031 ** cutext_lsource_h "lexical sources". 00032 ** 00033 ** Initial sources are provided for strings and files. These can be altered 00034 ** with the <code>cutext_source_stack_*</code> functions. In particular, 00035 ** - A source may have any encoding, or provide raw data. As long as it's 00036 ** supported by iconv, an encoded source can be transcoded with \ref 00037 ** cutext_source_stack_iconv. 00038 ** - A particular source implementation may or may not provide lookahead, 00039 ** but this can always be added with \ref cutext_source_stack_buffer. 00040 ** 00041 ** \see cutext_lsource_h 00042 ** \see cutext_sink_h 00043 **/ 00044 00045 /** \name Source API 00046 ** @{ */ 00047 00048 typedef enum { 00049 CUTEXT_SOURCE_INFO_ENCODING = 1, 00050 CUTEXT_SOURCE_INFO_TABSTOP = 3, 00051 } cutext_source_info_key_t; 00052 00053 CU_SINLINE cu_bool_t 00054 cutext_source_info_key_inherits(cutext_source_info_key_t key) 00055 { return key & 1; } 00056 00057 /** The type of the CUTEXT_SOURCE_INFO_ENCODING property. This trivial 00058 ** definition is provided to help keep type casts correct for boxing and 00059 ** unboxing operations. */ 00060 typedef char const *cutext_source_info_encoding_t; 00061 00062 /** The callbacks of a \ref cutext_source implementation. For compatibility 00063 ** with future versions, the structure should be zeroed before assignment of 00064 ** the call-backs. Alternatively, it may be declared static with C99 00065 ** initializers, as in <code>{.read = _my_read, ...}</code>. */ 00066 struct cutext_source_descriptor 00067 { 00068 size_t (*read)(cutext_source_t, void *, size_t); 00069 void const *(*look)(cutext_source_t, size_t, size_t *); 00070 void (*close)(cutext_source_t); 00071 cutext_source_t (*subsource)(cutext_source_t); 00072 cu_box_t (*info)(cutext_source_t, cutext_source_info_key_t); 00073 }; 00074 00075 /** The type of a text source. The implementation will typically embed it 00076 ** within a structure containing additional fields. */ 00077 struct cutext_source 00078 { 00079 cutext_source_descriptor_t descriptor; 00080 }; 00081 00082 CU_SINLINE cutext_source_descriptor_t 00083 cutext_source_descriptor(cutext_source_t src) 00084 { 00085 return src->descriptor; 00086 } 00087 00088 /** Initialize the base \a src of a source implementation with callbacks 00089 ** provided by \a descriptor. */ 00090 CU_SINLINE void 00091 cutext_source_init(cutext_source_t src, 00092 cutext_source_descriptor_t descriptor) 00093 { 00094 src->descriptor = descriptor; 00095 } 00096 00097 /** Read at most \a max_size bytes into \a buf, returning the actual number 00098 ** read or <code>(size_t)-1</code> on error. A return of 0 indicates the end 00099 ** of the stream, any other successful call reads at least one byte. */ 00100 CU_SINLINE size_t 00101 cutext_source_read(cutext_source_t src, void *buf, size_t max_size) 00102 { return (*src->descriptor->read)(src, buf, max_size); } 00103 00104 /** Skips over at most \a max_size bytes, returning the actual number skipped, 00105 ** or <code>(size_t)-1</code> on error. A return of 0 indicates the end of 00106 ** the stream. If a previous call to \ref cutext_source_look has returned a 00107 ** size of \a max_size or larger since the last read or skip, then this call 00108 ** succeeds with the full count. */ 00109 CU_SINLINE size_t 00110 cutext_source_skip(cutext_source_t src, size_t max_size) 00111 { return (*src->descriptor->read)(src, NULL, max_size); } 00112 00113 /** True iff \a src provides the \ref cutext_source_look method. */ 00114 CU_SINLINE cu_bool_t 00115 cutext_source_can_look(cutext_source_t src) 00116 { return src->descriptor->look != NULL; } 00117 00118 /** Request a lookahead of \a size bytes of upcoming data from \a src. If 00119 ** successful the data is returned and its actual size is assigned to 00120 ** <code>*\a size_out</code>. The actual size may be larger than \a size, and 00121 ** only on the end-of-stream or in case of error may it be smaller. 00122 ** 00123 ** This method is only provided by some source implementations, as reported by 00124 ** \ref cutext_source_can_look. If needed, \ref cutext_source_stack_buffer 00125 ** stacks a buffer onto any source to provide lookahead. */ 00126 CU_SINLINE void const * 00127 cutext_source_look(cutext_source_t src, size_t size, size_t *size_out) 00128 { 00129 return (*src->descriptor->look)(src, size, size_out); 00130 } 00131 00132 /** Close \a src, which in turn should close its subsources. */ 00133 CU_SINLINE void 00134 cutext_source_close(cutext_source_t src) 00135 { 00136 (*src->descriptor->close)(src); 00137 } 00138 00139 CU_SINLINE cu_box_t 00140 cutext_source_info(cutext_source_t src, cutext_source_info_key_t key) 00141 { return (*src->descriptor->info)(src, key); } 00142 00143 /** Assist the source implementation \a src with a subsource \a subsrc in 00144 ** providing a suitable default value for \a key. */ 00145 cu_box_t cutext_source_info_inherit(cutext_source_t src, 00146 cutext_source_info_key_t key, 00147 cutext_source_t subsrc); 00148 00149 /** The name of the character encoding of \a src, or \c NULL if unknown. */ 00150 CU_SINLINE char const * 00151 cutext_source_encoding(cutext_source_t src) 00152 { return cu_unbox_ptr(cutext_source_info_encoding_t, 00153 cutext_source_info(src, CUTEXT_SOURCE_INFO_ENCODING)); } 00154 00155 00156 /** @} 00157 ** \name Generic Callbacks 00158 ** @{ */ 00159 00160 /** Always returns 0. */ 00161 size_t cutext_source_null_read(cutext_source_t, void *, size_t); 00162 00163 /** The trivial close operation. */ 00164 void cutext_source_noop_close(cutext_source_t); 00165 00166 /** Returns \c NULL. */ 00167 cutext_source_t cutext_source_no_subsource(cutext_source_t); 00168 00169 /** Returns \a NULL for encoding, and 8 for tabstop. */ 00170 cu_box_t cutext_source_default_info(cutext_source_t, cutext_source_info_key_t); 00171 00172 00173 /** @} 00174 ** \name Source Implementations 00175 ** @{ */ 00176 00177 /** Return a source over a \a size bytes of data stored from \a data and 00178 ** considered to be encoded as \a enc. */ 00179 cutext_source_t cutext_source_new_mem(char const *enc, 00180 void const *data, size_t size); 00181 00182 /** Return a source over the 0-terminated UTF-8 string \a cstr. */ 00183 cutext_source_t cutext_source_new_cstr(char const *cstr); 00184 00185 /** Return a source over the UTF-8 string \a str. */ 00186 cutext_source_t cutext_source_new_str(cu_str_t str); 00187 00188 /** Return a source over the wide string \a wstr. */ 00189 cutext_source_t cutext_source_new_wstring(cu_wstring_t wstr); 00190 00191 /** Return a source over the contents read from \a fd encoded as \a enc. If \a 00192 ** close_fd is true, then close \a fd when \ref cutext_source_close is called 00193 ** on the returned source. */ 00194 cutext_source_t cutext_source_fdopen(char const *enc, 00195 int fd, cu_bool_t close_fd); 00196 00197 /** Return a source over the contents of the file at \a path encoded as \a 00198 ** encoding. */ 00199 cutext_source_t cutext_source_fopen(char const *encoding, char const *path); 00200 00201 /** Stack a buffer on top of \a subsrc to provide lookahead. The \ref 00202 ** cutext_source_look method is guaranteed to be available on the returned 00203 ** source. */ 00204 cutext_source_t cutext_source_stack_buffer(cutext_source_t subsrc); 00205 00206 /** Stack an iconv conversion filter on \a subsrc, encoding to \a newenc. */ 00207 cutext_source_t cutext_source_stack_iconv(char const *newenc, 00208 cutext_source_t subsrc); 00209 00210 /** @} 00211 ** \name Algorithms 00212 ** @{ */ 00213 00214 /** Drain \a src and return the number of bytes which were left. */ 00215 size_t cutext_source_count(cutext_source_t src); 00216 00217 /** Try to guess which Unicode encoding is used in \a src. This requires that 00218 ** \a src supports lookahead. */ 00219 cutext_encoding_t cutext_source_guess_encoding(cutext_source_t src); 00220 00221 /** @} 00222 ** @} */ 00223 CU_END_DECLARATIONS 00224 00225 #endif