spell_defs.h
Go to the documentation of this file.
1 #ifndef NVIM_SPELL_DEFS_H
2 #define NVIM_SPELL_DEFS_H
3 
4 #include <stdbool.h>
5 #include <stdint.h>
6 
7 #include "nvim/buffer_defs.h"
8 #include "nvim/garray.h"
9 #include "nvim/regexp_defs.h"
10 #include "nvim/types.h"
11 
12 #define MAXWLEN 254 // Assume max. word len is this many bytes.
13  // Some places assume a word length fits in a
14  // byte, thus it can't be above 255.
15 
16 // Number of regions supported.
17 #define MAXREGIONS 8
18 
19 // The type used for indexes in the word tree needs to be at least 4 bytes. If
20 // int is 8 bytes we could use something smaller, but what?
21 typedef int idx_T;
22 
23 # define SPL_FNAME_TMPL "%s.%s.spl"
24 # define SPL_FNAME_ADD ".add."
25 # define SPL_FNAME_ASCII ".ascii."
26 
27 // Flags used for a word. Only the lowest byte can be used, the region byte
28 // comes above it.
29 #define WF_REGION 0x01 // region byte follows
30 #define WF_ONECAP 0x02 // word with one capital (or all capitals)
31 #define WF_ALLCAP 0x04 // word must be all capitals
32 #define WF_RARE 0x08 // rare word
33 #define WF_BANNED 0x10 // bad word
34 #define WF_AFX 0x20 // affix ID follows
35 #define WF_FIXCAP 0x40 // keep-case word, allcap not allowed
36 #define WF_KEEPCAP 0x80 // keep-case word
37 
38 // for <flags2>, shifted up one byte to be used in wn_flags
39 #define WF_HAS_AFF 0x0100 // word includes affix
40 #define WF_NEEDCOMP 0x0200 // word only valid in compound
41 #define WF_NOSUGGEST 0x0400 // word not to be suggested
42 #define WF_COMPROOT 0x0800 // already compounded word, COMPOUNDROOT
43 #define WF_NOCOMPBEF 0x1000 // no compounding before this word
44 #define WF_NOCOMPAFT 0x2000 // no compounding after this word
45 
46 // flags for <pflags>
47 #define WFP_RARE 0x01 // rare prefix
48 #define WFP_NC 0x02 // prefix is not combining
49 #define WFP_UP 0x04 // to-upper prefix
50 #define WFP_COMPPERMIT 0x08 // prefix with COMPOUNDPERMITFLAG
51 #define WFP_COMPFORBID 0x10 // prefix with COMPOUNDFORBIDFLAG
52 
53 // Flags for postponed prefixes in "sl_pidxs". Must be above affixID (one
54 // byte) and prefcondnr (two bytes).
 // Each flag is the matching <pflags> bit (WFP_*) shifted up by 24 bits, which
 // places it above those three bytes.
55 #define WF_RAREPFX (WFP_RARE << 24) // rare postponed prefix (0x01000000)
56 #define WF_PFX_NC (WFP_NC << 24) // non-combining postponed prefix (0x02000000)
57 #define WF_PFX_UP (WFP_UP << 24) // to-upper postponed prefix (0x04000000)
58 #define WF_PFX_COMPPERMIT (WFP_COMPPERMIT << 24) // postponed prefix with
59  // COMPOUNDPERMITFLAG (0x08000000)
60 #define WF_PFX_COMPFORBID (WFP_COMPFORBID << 24) // postponed prefix with
61  // COMPOUNDFORBIDFLAG (0x10000000)
62 
63 
64 // flags for <compoptions>
65 #define COMP_CHECKDUP 1 // CHECKCOMPOUNDDUP
66 #define COMP_CHECKREP 2 // CHECKCOMPOUNDREP
67 #define COMP_CHECKCASE 4 // CHECKCOMPOUNDCASE
68 #define COMP_CHECKTRIPLE 8 // CHECKCOMPOUNDTRIPLE
69 
70 // Info from "REP", "REPSAL" and "SAL" entries in ".aff" file used in si_rep,
71 // si_repsal, sl_rep, and si_sal. Not for sl_sal!
72 // One replacement: from "ft_from" to "ft_to".
73 typedef struct fromto_S {
74  char_u *ft_from; // text to be replaced
75  char_u *ft_to; // replacement text
76 } fromto_T;
77 
78 // Info from "SAL" entries in ".aff" file used in sl_sal.
79 // The info is split for quick processing by spell_soundfold().
80 // Note that "sm_oneof" and "sm_rules" point into sm_lead.
 // NOTE(review): because "sm_oneof" and "sm_rules" alias "sm_lead", they
 // presumably must not be freed separately -- confirm against the cleanup code.
81 typedef struct salitem_S {
82  char_u *sm_lead; // leading letters
83  int sm_leadlen; // length of "sm_lead"
84  char_u *sm_oneof; // letters from () or NULL; points into "sm_lead"
85  char_u *sm_rules; // rules like ^, $, priority; points into "sm_lead"
86  char_u *sm_to; // replacement
87  int *sm_lead_w; // wide character copy of "sm_lead"
88  int *sm_oneof_w; // wide character copy of "sm_oneof"
89  int *sm_to_w; // wide character copy of "sm_to"
90 } salitem_T;
91 
92 typedef int salfirst_T;
93 
94 // Values for SP_*ERROR are negative, positive values are used by
95 // read_cnt_string().
96 #define SP_TRUNCERROR -1 // spell file truncated error
97 #define SP_FORMERROR -2 // format error in spell file
98 #define SP_OTHERERROR -3 // other error while reading spell file
99 
100 // Structure used to store words and other info for one language, loaded from
101 // a .spl file.
102 // The main access is through the tree in "sl_fbyts/sl_fidxs", storing the
103 // case-folded words. "sl_kbyts/sl_kidxs" is for keep-case words.
104 //
105 // The "byts" array stores the possible bytes in each tree node, preceded by
106 // the number of possible bytes, sorted on byte value:
107 // <len> <byte1> <byte2> ...
108 // The "idxs" array stores the index of the child node corresponding to the
109 // byte in "byts".
110 // Exception: when the byte is zero, the word may end here and "idxs" holds
111 // the flags, region mask and affixID for the word. There may be several
112 // zeros in sequence for alternative flag/region/affixID combinations.
113 typedef struct slang_S slang_T;
114 
115 struct slang_S {
116  slang_T *sl_next; // next language in the linked list
117  char_u *sl_name; // language name "en", "en.rare", "nl", etc.
118  char_u *sl_fname; // name of .spl file
119  bool sl_add; // true if it's a .add file.
120 
 // Word trees: each tree is a "byts" array paired with an "idxs" array.
121  char_u *sl_fbyts; // case-folded word bytes
122  idx_T *sl_fidxs; // case-folded word indexes
123  char_u *sl_kbyts; // keep-case word bytes
124  idx_T *sl_kidxs; // keep-case word indexes
125  char_u *sl_pbyts; // prefix tree word bytes
126  idx_T *sl_pidxs; // prefix tree word indexes; also carries the
 // postponed prefix flags (WF_RAREPFX, WF_PFX_*)
127 
128  char_u *sl_info; // infotext string or NULL
129 
130  char_u sl_regions[MAXREGIONS * 2 + 1];
131  // table with up to MAXREGIONS (8) region names,
 // two bytes per name, plus a terminating NUL
132 
133  char_u *sl_midword; // MIDWORD string or NULL
134 
135  hashtab_T sl_wordcount; // hashtable with word count, wordcount_T
136 
 // Compounding.
137  int sl_compmax; // COMPOUNDWORDMAX (default: MAXWLEN)
138  int sl_compminlen; // COMPOUNDMIN (default: 0)
139  int sl_compsylmax; // COMPOUNDSYLMAX (default: MAXWLEN)
140  int sl_compoptions; // COMP_* flags
141  garray_T sl_comppat; // CHECKCOMPOUNDPATTERN items
142  regprog_T *sl_compprog; // COMPOUNDRULE turned into a regexp program
143  // (NULL when no compounding)
144  char_u *sl_comprules; // all COMPOUNDRULE concatenated (or NULL)
145  char_u *sl_compstartflags; // flags for first compound word
146  char_u *sl_compallflags; // all flags for compound words
147  bool sl_nobreak; // When true: no spaces between words
148  char_u *sl_syllable; // SYLLABLE repeatable chars or NULL
149  garray_T sl_syl_items; // syllable items
150 
151  int sl_prefixcnt; // number of items in "sl_prefprog"
152  regprog_T **sl_prefprog; // table with regprogs for prefixes
153 
 // Replacement (REP/REPSAL) and sound-folding (SAL/SOFO) info.
154  garray_T sl_rep; // list of fromto_T entries from REP lines
155  int16_t sl_rep_first[256]; // indexes where byte first appears, -1 if
156  // there is none
157  garray_T sl_sal; // list of salitem_T entries from SAL lines
158  salfirst_T sl_sal_first[256]; // indexes where byte first appears, -1 if
159  // there is none
160  bool sl_followup; // SAL followup
161  bool sl_collapse; // SAL collapse_result
162  bool sl_rem_accents; // SAL remove_accents
163  bool sl_sofo; // SOFOFROM and SOFOTO instead of SAL items:
164  // "sl_sal_first" maps chars, when has_mbyte
165  // "sl_sal" is a list of wide char lists.
166  garray_T sl_repsal; // list of fromto_T entries from REPSAL lines
167  int16_t sl_repsal_first[256]; // sl_rep_first for REPSAL lines
168  bool sl_nosplitsugs; // don't suggest splitting a word
169  bool sl_nocompoundsugs; // don't suggest compounding
170 
171  // Info from the .sug file. Loaded on demand.
172  time_t sl_sugtime; // timestamp for .sug file
173  char_u *sl_sbyts; // soundfolded word bytes
174  idx_T *sl_sidxs; // soundfolded word indexes
175  buf_T *sl_sugbuf; // buffer with word number table
176  bool sl_sugloaded; // true when .sug file was loaded or failed to
177  // load
178 
179  bool sl_has_map; // true, if there is a MAP line
180  hashtab_T sl_map_hash; // MAP for multi-byte chars
181  int sl_map_array[256]; // MAP for first 256 chars
182  hashtab_T sl_sounddone; // table with soundfolded words that have been
183  // handled, see add_sound_suggest()
184 };
185 
186 // Structure used in "b_langp", filled from 'spelllang'.
 // Individual entries of a growarray of these are reached with LANGP_ENTRY().
187 typedef struct langp_S {
188  slang_T *lp_slang; // info for this language
189  slang_T *lp_sallang; // language used for sound folding or NULL
190  slang_T *lp_replang; // language used for REP items or NULL
191  int lp_region; // bitmask for region or REGION_ALL (0xff)
192 } langp_T;
193 
194 #define LANGP_ENTRY(ga, i) (((langp_T *)(ga).ga_data) + (i))
195 
196 #define VIMSUGMAGIC "VIMsug" // string at start of Vim .sug file
197 #define VIMSUGMAGICL 6
198 #define VIMSUGVERSION 1
199 
200 #define REGION_ALL 0xff // word valid in all regions
201 
202 // The tables used for recognizing word characters according to spelling.
203 // These are only used for the first 256 characters of 'encoding'.
 // Each table has one entry per character value 0..255; the SPELL_TOFOLD(),
 // SPELL_TOUPPER() and SPELL_ISUPPER() macros index them with values below 256.
204 typedef struct {
205  bool st_isw[256]; // flags: is word char
206  bool st_isu[256]; // flags: is uppercase char
207  char_u st_fold[256]; // chars: folded case
208  char_u st_upper[256]; // chars: upper case
209 } spelltab_T;
210 
211 // For finding suggestions: At each node in the tree these states are tried:
 // STATE_START is explicitly 0; the following enumerators take consecutive
 // values in declaration order.
212 typedef enum {
213  STATE_START = 0, // At start of node check for NUL bytes (goodword
214  // ends); if badword ends there is a match, otherwise
215  // try splitting word.
216  STATE_NOPREFIX, // Try without prefix.
217  STATE_SPLITUNDO, // Undo splitting.
218  STATE_ENDNUL, // Past NUL bytes at start of the node.
219  STATE_PLAIN, // Use each byte of the node.
220  STATE_DEL, // Delete a byte from the bad word.
221  STATE_INS_PREP, // Prepare for inserting bytes.
222  STATE_INS, // Insert a byte in the bad word.
223  STATE_SWAP, // Swap two bytes.
224  STATE_UNSWAP, // Undo swap two characters.
225  STATE_SWAP3, // Swap two characters over three.
226  STATE_UNSWAP3, // Undo swap two characters over three.
227  STATE_UNROT3L, // Undo rotate three characters left.
228  STATE_UNROT3R, // Undo rotate three characters right.
229  STATE_REP_INI, // Prepare for using REP items.
230  STATE_REP, // Use matching REP items from the .aff file.
231  STATE_REP_UNDO, // Undo a REP item replacement.
232  STATE_FINAL // End of this node.
233 } state_T;
234 
235 // Struct to keep the state at each level in suggest_try_change().
 // Most index and length members are char_u, i.e. a single byte; MAXWLEN is
 // kept below 256 so that word lengths fit.
236 typedef struct trystate_S {
237  state_T ts_state; // state at this level, STATE_
238  int ts_score; // score
239  idx_T ts_arridx; // index in tree array, start of node
240  short ts_curi; // index in list of child nodes
241  char_u ts_fidx; // index in fword[], case-folded bad word
242  char_u ts_fidxtry; // ts_fidx at which bytes may be changed
243  char_u ts_twordlen; // valid length of tword[]
244  char_u ts_prefixdepth; // stack depth for end of prefix or
245  // PFD_PREFIXTREE or PFD_NOPREFIX
246  char_u ts_flags; // TSF_ flags
247  char_u ts_tcharlen; // number of bytes in tword character
248  char_u ts_tcharidx; // current byte index in tword character
249  char_u ts_isdiff; // DIFF_ values
250  char_u ts_fcharstart; // index in fword where badword char started
251  char_u ts_prewordlen; // length of word in "preword[]"
252  char_u ts_splitoff; // index in "tword" after last split
253  char_u ts_splitfidx; // "ts_fidx" at word split
254  char_u ts_complen; // nr of compound words used
255  char_u ts_compsplit; // index for "compflags" where word was split
256  char_u ts_save_badflags; // su_badflags saved here
257  char_u ts_delidx; // index in fword for char that was deleted,
258  // valid when "ts_flags" has TSF_DIDDEL
259 } trystate_T;
260 
261 // Use our own character-case definitions, because the current locale may
262 // differ from what the .spl file uses.
263 // These must not be called with negative number!
264 #include <wchar.h> // for towupper() and towlower()
265 // Multi-byte implementation. For Unicode we can call utf_*(), but don't do
266 // that for ASCII, because we don't want to use 'casemap' here. Otherwise use
267 // the "w" library function for characters above 255.
 // Case-fold character "c": with enc_utf8 set and c >= 128 call utf_fold();
 // otherwise look up values below 256 in spelltab.st_fold[] and pass larger
 // values to towlower().
 // NOTE: all three SPELL_* macros evaluate "c" more than once, so the argument
 // must not have side effects.
268 #define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
269  : (c) < \
270  256 ? (int)spelltab.st_fold[c] : (int)towlower(c))
271 
 // Upper-case character "c": with enc_utf8 set and c >= 128 call mb_toupper();
 // otherwise look up values below 256 in spelltab.st_upper[] and pass larger
 // values to towupper().
272 #define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? mb_toupper(c) \
273  : (c) < \
274  256 ? (int)spelltab.st_upper[c] : (int)towupper(c))
275 
 // Test whether character "c" is upper case: with enc_utf8 set and c >= 128
 // call mb_isupper(); otherwise look up values below 256 in spelltab.st_isu[]
 // and pass larger values to iswupper().
 // NOTE(review): ISO C declares iswupper() in <wctype.h>; confirm it is in
 // scope through the headers included here.
276 #define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? mb_isupper(c) \
277  : (c) < 256 ? spelltab.st_isu[c] : iswupper(c))
278 
279 // First language that is loaded, start of the linked list of loaded
280 // languages.
281 extern slang_T *first_lang;
282 
283 // file used for "zG" and "zW"
284 extern char_u *int_wordlist;
285 
286 extern spelltab_T spelltab;
287 extern int did_set_spelltab;
288 
289 extern char *e_format;
290 
291 #endif // NVIM_SPELL_DEFS_H
buf_T * sl_sugbuf
Definition: spell_defs.h:175
char_u ts_prefixdepth
Definition: spell_defs.h:244
slang_T * lp_sallang
Definition: spell_defs.h:189
char_u ts_delidx
Definition: spell_defs.h:257
Definition: spell_defs.h:236
bool sl_nocompoundsugs
Definition: spell_defs.h:169
struct langp_S langp_T
state_T
Definition: spell_defs.h:212
bool sl_sofo
Definition: spell_defs.h:163
slang_T * sl_next
Definition: spell_defs.h:116
int salfirst_T
Definition: spell_defs.h:92
char_u ts_tcharidx
Definition: spell_defs.h:248
int * sm_lead_w
Definition: spell_defs.h:87
bool sl_collapse
Definition: spell_defs.h:161
char_u * sm_lead
Definition: spell_defs.h:82
int sl_compmax
Definition: spell_defs.h:137
Definition: spell_defs.h:218
char_u * sl_midword
Definition: spell_defs.h:133
int idx_T
Definition: spell_defs.h:21
Definition: spell_defs.h:231
char_u * sm_to
Definition: spell_defs.h:86
char_u ts_save_badflags
Definition: spell_defs.h:256
slang_T * first_lang
Definition: spell.c:133
Definition: spell_defs.h:219
Definition: spell_defs.h:225
int lp_region
Definition: spell_defs.h:191
idx_T ts_arridx
Definition: spell_defs.h:239
char_u * sl_kbyts
Definition: spell_defs.h:123
char_u ts_twordlen
Definition: spell_defs.h:243
char_u ts_splitfidx
Definition: spell_defs.h:253
struct fromto_S fromto_T
int sm_leadlen
Definition: spell_defs.h:83
time_t sl_sugtime
Definition: spell_defs.h:172
char_u * ft_to
Definition: spell_defs.h:75
char_u ts_flags
Definition: spell_defs.h:246
char_u * int_wordlist
Definition: spell.c:136
Definition: spell_defs.h:232
char_u * sl_comprules
Definition: spell_defs.h:144
struct salitem_S salitem_T
short ts_curi
Definition: spell_defs.h:240
regprog_T ** sl_prefprog
Definition: spell_defs.h:152
Definition: spell_defs.h:230
int ts_score
Definition: spell_defs.h:238
regprog_T * sl_compprog
Definition: spell_defs.h:142
char_u * sm_oneof
Definition: spell_defs.h:84
char_u ts_tcharlen
Definition: spell_defs.h:247
bool sl_nosplitsugs
Definition: spell_defs.h:168
char_u ts_splitoff
Definition: spell_defs.h:252
Definition: spell_defs.h:222
bool sl_followup
Definition: spell_defs.h:160
garray_T sl_repsal
Definition: spell_defs.h:166
garray_T sl_sal
Definition: spell_defs.h:157
Definition: spell_defs.h:81
char_u ts_prewordlen
Definition: spell_defs.h:251
bool sl_add
Definition: spell_defs.h:119
char_u * sl_fname
Definition: spell_defs.h:118
char_u ts_complen
Definition: spell_defs.h:254
state_T ts_state
Definition: spell_defs.h:237
char_u ts_isdiff
Definition: spell_defs.h:249
char_u * sl_name
Definition: spell_defs.h:117
idx_T * sl_fidxs
Definition: spell_defs.h:122
Definition: spell_defs.h:227
Definition: spell_defs.h:115
Definition: spell_defs.h:221
char_u * sl_pbyts
Definition: spell_defs.h:125
Definition: spell_defs.h:204
struct trystate_S trystate_T
int sl_compminlen
Definition: spell_defs.h:138
idx_T * sl_sidxs
Definition: spell_defs.h:174
slang_T * lp_slang
Definition: spell_defs.h:188
Definition: buffer_defs.h:469
Definition: spell_defs.h:226
Definition: spell_defs.h:229
Definition: spell_defs.h:224
Definition: garray.h:12
char_u * sl_info
Definition: spell_defs.h:128
garray_T sl_syl_items
Definition: spell_defs.h:149
Definition: spell_defs.h:213
Definition: spell_defs.h:187
bool sl_has_map
Definition: spell_defs.h:179
char * e_format
Definition: spell.c:331
char_u ts_fcharstart
Definition: spell_defs.h:250
int sl_compoptions
Definition: spell_defs.h:140
char_u * sm_rules
Definition: spell_defs.h:85
Definition: spell_defs.h:217
Definition: hashtab.h:62
int sl_compsylmax
Definition: spell_defs.h:139
bool sl_rem_accents
Definition: spell_defs.h:162
idx_T * sl_pidxs
Definition: spell_defs.h:126
Definition: spell_defs.h:223
bool sl_nobreak
Definition: spell_defs.h:147
garray_T sl_rep
Definition: spell_defs.h:154
char_u * sl_compallflags
Definition: spell_defs.h:146
unsigned char char_u
Definition: types.h:11
char_u * sl_compstartflags
Definition: spell_defs.h:145
Definition: spell_defs.h:228
Definition: spell_defs.h:220
int did_set_spelltab
Definition: spell.c:289
garray_T sl_comppat
Definition: spell_defs.h:141
char_u * ft_from
Definition: spell_defs.h:74
Definition: spell_defs.h:216
#define MAXREGIONS
Definition: spell_defs.h:17
slang_T * lp_replang
Definition: spell_defs.h:190
bool sl_sugloaded
Definition: spell_defs.h:176
idx_T * sl_kidxs
Definition: spell_defs.h:124
int * sm_oneof_w
Definition: spell_defs.h:88
Definition: regexp_defs.h:70
hashtab_T sl_wordcount
Definition: spell_defs.h:135
char_u ts_fidxtry
Definition: spell_defs.h:242
char_u ts_fidx
Definition: spell_defs.h:241
int sl_prefixcnt
Definition: spell_defs.h:151
spelltab_T spelltab
Definition: spell.c:288
char_u * sl_sbyts
Definition: spell_defs.h:173
Definition: spell_defs.h:73
int * sm_to_w
Definition: spell_defs.h:89
char_u * sl_syllable
Definition: spell_defs.h:148
hashtab_T sl_map_hash
Definition: spell_defs.h:180
hashtab_T sl_sounddone
Definition: spell_defs.h:182
char_u * sl_fbyts
Definition: spell_defs.h:121
char_u ts_compsplit
Definition: spell_defs.h:255