suricata
util-memcmp.h
Go to the documentation of this file.
1 /* Copyright (C) 2007-2013 Open Information Security Foundation
2  *
3  * You can copy, redistribute or modify this Program under the terms of
4  * the GNU General Public License version 2 as published by the Free
5  * Software Foundation.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * version 2 along with this program; if not, write to the Free Software
14  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
15  * 02110-1301, USA.
16  */
17 
18 /**
19  * \file
20  *
21  * \author Victor Julien <victor@inliniac.net>
22  *
23  * Memcmp implementations for SSE3, SSE4.1, SSE4.2 and TILE-Gx SIMD.
24  *
25  * Both SCMemcmp and SCMemcmpLowercase return 0 on a exact match,
26  * 1 on a failed match.
27  */
28 
29 #ifndef __UTIL_MEMCMP_H__
30 #define __UTIL_MEMCMP_H__
31 
32 #include "util-optimize.h"
33 
34 /** \brief compare two patterns, converting the 2nd to lowercase
35  * \warning *ONLY* the 2nd pattern is converted to lowercase
36  */
37 static inline int SCMemcmpLowercase(const void *, const void *, size_t);
38 
39 void MemcmpRegisterTests(void);
40 
/** \brief Scalar case-insensitive compare, used as the tail/fallback path
 *         by the SIMD implementations.
 *
 *  Only s2 is converted to lowercase (via u8_tolower); s1 is expected to
 *  already be lowercase.
 *
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper- and lowercase
 *  \param n number of bytes to compare
 *  \retval 0 match
 *  \retval 1 mismatch
 */
static inline int
MemcmpLowercase(const void *s1, const void *s2, size_t n)
{
    ssize_t i;

    /* check backwards because we already tested the first
     * 2 to 4 chars. This way we are more likely to detect
     * a miss and thus speed up a little... */
    for (i = n - 1; i >= 0; i--) {
        /* cast through const uint8_t *: the parameters are const and the
         * old plain (uint8_t *) casts discarded that qualifier */
        if (((const uint8_t *)s1)[i] != u8_tolower(*(((const uint8_t *)s2) + i)))
            return 1;
    }

    return 0;
}
56 
57 #if defined(__SSE4_2__)
58 
59 #include <nmmintrin.h>
60 
/* SSE 4.2 implementations, using the string-compare intrinsics */
62 
/** \brief compare two buffers for equality using SSE 4.2 string intrinsics
 *
 *  Processes 16 bytes per iteration with _mm_cmpestri; the trailing
 *  (< 16 byte) remainder is handed to plain memcmp.
 *
 *  \param s1 first buffer
 *  \param s2 second buffer
 *  \param n number of bytes to compare
 *  \retval 0 buffers are equal
 *  \retval 1 buffers differ
 */
static inline int SCMemcmp(const void *s1, const void *s2, size_t n)
{
    __m128i b1, b2;

    int r;
    /* counter for how far we already matched in the buffer */
    size_t m = 0;

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(n - m < 16)) {
            return memcmp(s1, s2, n - m) ? 1 : 0;
        }

        /* load the buffers into the 128bit vars */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);

        /* do the actual compare: r is the index of the first mismatching
         * byte in the chunk, or 16 if the full chunk matched */
        m += (r = _mm_cmpestri(b1, n - m, b2, 16,
                    _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY));

        /* note: arithmetic on void * is a GCC extension */
        s1 += 16;
        s2 += 16;
    } while (r == 16);

    return ((m == n) ? 0 : 1);
}
92 
/* Range of values of uppercase characters ('A'..'Z'), used as the range
 * operand for _mm_cmpestrm below. Only the first 2 bytes are significant. */
static char scmemcmp_uppercase[16] __attribute__((aligned(16))) = {
    'A', 'Z', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, };
96 
/** \brief compare two buffers in a case insensitive way (SSE 4.2)
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper and lowercase
 *  \param n number of bytes to compare
 *  \retval 0 match
 *  \retval 1 mismatch
 */
static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t n)
{
    __m128i b1, b2, mask;

    int r;
    /* counter for how far we already matched in the buffer */
    size_t m = 0;

    /* 'A'..'Z' range operand, an all-zero vector and a vector of 0x20
     * ('a' - 'A'); used below for the upper-to-lowercase conversion */
    __m128i ucase = _mm_load_si128((const __m128i *) scmemcmp_uppercase);
    __m128i nulls = _mm_setzero_si128();
    __m128i uplow = _mm_set1_epi8(0x20);

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(n - m < 16)) {
            return MemcmpLowercase(s1, s2, n - m);
        }

        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);
        size_t len = n - m;

        /* The first step is creating a mask that is FF for all uppercase
         * characters, 00 for all others */
        mask = _mm_cmpestrm(ucase, 2, b2, len, _SIDD_CMP_RANGES | _SIDD_UNIT_MASK);
        /* Next we use that mask to create a new: this one has 0x20 for
         * the uppercase chars, 00 for all other. */
        mask = _mm_blendv_epi8(nulls, uplow, mask);
        /* finally, merge the mask and the buffer converting the
         * uppercase to lowercase */
        b2 = _mm_add_epi8(b2, mask);

        /* search using our converted buffer: r is the index of the first
         * mismatching byte, or 16 if the full chunk matched */
        m += (r = _mm_cmpestri(b1, len, b2, 16,
                    _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY));

        s1 += 16;
        s2 += 16;
    } while (r == 16);

    return ((m == n) ? 0 : 1);
}
144 
145 #elif defined(__SSE4_1__)
146 
147 #include <smmintrin.h>
148 
149 #define SCMEMCMP_BYTES 16
150 
/** \brief compare two buffers for equality (SSE 4.1)
 *
 *  Compares 16 bytes per iteration with _mm_cmpeq_epi8/_mm_movemask_epi8;
 *  the trailing (< 16 byte) remainder is handed to plain memcmp.
 *
 *  \param s1 first buffer
 *  \param s2 second buffer
 *  \param len number of bytes to compare
 *  \retval 0 buffers are equal
 *  \retval 1 buffers differ
 */
static inline int SCMemcmp(const void *s1, const void *s2, size_t len)
{
    size_t offset = 0;
    __m128i b1, b2, c;

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(len - offset < 16)) {
            return memcmp(s1, s2, len - offset) ? 1 : 0;
        }

        /* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
         * _mm_lddqu_si128 was about 2% slower even though it's supposed to
         * be faster. */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);
        /* per-byte equality: 0xFF where bytes match, 0x00 where they differ */
        c = _mm_cmpeq_epi8(b1, b2);

        int diff = len - offset;
        /* NOTE(review): the guard at the top of the loop already returned
         * when len - offset < 16, so diff is always >= 16 here and this
         * branch looks unreachable — confirm before relying on it */
        if (diff < 16) {
            int rmask = ~(0xFFFFFFFF << diff);

            if ((_mm_movemask_epi8(c) & rmask) != rmask) {
                return 1;
            }
        } else {
            /* all 16 byte-compare bits must be set for a full-chunk match */
            if (_mm_movemask_epi8(c) != 0x0000FFFF) {
                return 1;
            }
        }

        offset += SCMEMCMP_BYTES;
        s1 += SCMEMCMP_BYTES;
        s2 += SCMEMCMP_BYTES;
    } while (len > offset);

    return 0;
}
190 
191 #define UPPER_LOW 0x40 /* "A" - 1 */
192 #define UPPER_HIGH 0x5B /* "Z" + 1 */
193 
/** \brief compare two buffers in a case insensitive way (SSE 4.1)
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper and lowercase
 *  \param len number of bytes to compare
 *  \retval 0 match
 *  \retval 1 mismatch
 */
static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t len)
{
    size_t offset = 0;
    __m128i b1, b2, mask1, mask2, upper1, upper2, nulls, uplow;

    /* setup registers for upper to lower conversion:
     * UPPER_LOW ('A' - 1), UPPER_HIGH ('Z' + 1), zeroes and 0x20 */
    upper1 = _mm_set1_epi8(UPPER_LOW);
    upper2 = _mm_set1_epi8(UPPER_HIGH);
    nulls = _mm_setzero_si128();
    uplow = _mm_set1_epi8(0x20);

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(len - offset < 16)) {
            return MemcmpLowercase(s1, s2, len - offset);
        }

        /* unaligned loading of the bytes to compare */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);

        /* mark all chars bigger than upper1 */
        mask1 = _mm_cmpgt_epi8(b2, upper1);
        /* mark all chars lower than upper2 */
        mask2 = _mm_cmplt_epi8(b2, upper2);
        /* merge the two, leaving only those that are true in both */
        mask1 = _mm_cmpeq_epi8(mask1, mask2);
        /* Next we use that mask to create a new: this one has 0x20 for
         * the uppercase chars, 00 for all other. */
        mask1 = _mm_blendv_epi8(nulls, uplow, mask1);

        /* add to b2, converting uppercase to lowercase */
        b2 = _mm_add_epi8(b2, mask1);

        /* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
        mask1 = _mm_cmpeq_epi8(b1, b2);

        int diff = len - offset;
        /* NOTE(review): the guard at the top of the loop already returned
         * when len - offset < 16, so this branch looks unreachable —
         * confirm before relying on it */
        if (diff < 16) {
            int rmask = ~(0xFFFFFFFF << diff);

            if ((_mm_movemask_epi8(mask1) & rmask) != rmask) {
                return 1;
            }
        } else {
            /* all 16 byte-compare bits must be set for a full-chunk match */
            if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
                return 1;
            }
        }

        offset += SCMEMCMP_BYTES;
        s1 += SCMEMCMP_BYTES;
        s2 += SCMEMCMP_BYTES;
    } while (len > offset);

    return 0;
}
252 
253 
254 
255 #elif defined(__SSE3__)
256 
257 #include <pmmintrin.h> /* for SSE3 */
258 
259 #define SCMEMCMP_BYTES 16
260 
/** \brief compare two buffers for equality (SSE3)
 *
 *  Same algorithm as the SSE 4.1 variant: 16 bytes per iteration via
 *  _mm_cmpeq_epi8/_mm_movemask_epi8, trailing remainder via memcmp.
 *
 *  \param s1 first buffer
 *  \param s2 second buffer
 *  \param len number of bytes to compare
 *  \retval 0 buffers are equal
 *  \retval 1 buffers differ
 */
static inline int SCMemcmp(const void *s1, const void *s2, size_t len)
{
    size_t offset = 0;
    __m128i b1, b2, c;

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(len - offset < 16)) {
            return memcmp(s1, s2, len - offset) ? 1 : 0;
        }

        /* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
         * _mm_lddqu_si128 was about 2% slower even though it's supposed to
         * be faster. */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);
        /* per-byte equality: 0xFF where bytes match, 0x00 where they differ */
        c = _mm_cmpeq_epi8(b1, b2);

        int diff = len - offset;
        /* NOTE(review): the guard at the top of the loop already returned
         * when len - offset < 16, so diff is always >= 16 here and this
         * branch looks unreachable — confirm before relying on it */
        if (diff < 16) {
            int rmask = ~(0xFFFFFFFF << diff);

            if ((_mm_movemask_epi8(c) & rmask) != rmask) {
                return 1;
            }
        } else {
            /* all 16 byte-compare bits must be set for a full-chunk match */
            if (_mm_movemask_epi8(c) != 0x0000FFFF) {
                return 1;
            }
        }

        offset += SCMEMCMP_BYTES;
        s1 += SCMEMCMP_BYTES;
        s2 += SCMEMCMP_BYTES;
    } while (len > offset);

    return 0;
}
300 
301 #define UPPER_LOW 0x40 /* "A" - 1 */
302 #define UPPER_HIGH 0x5B /* "Z" + 1 */
303 #define UPPER_DELTA 0xDF /* 0xFF - 0x20 */
304 
/** \brief compare two buffers in a case insensitive way (SSE3)
 *
 *  Like the SSE 4.1 variant but without _mm_blendv_epi8: the 0x00/0xFF
 *  uppercase mask is turned into 0x00/0x20 using a saturating subtract
 *  of UPPER_DELTA (0xFF - 0x20).
 *
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper and lowercase
 *  \param len number of bytes to compare
 *  \retval 0 match
 *  \retval 1 mismatch
 */
static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t len)
{
    size_t offset = 0;
    __m128i b1, b2, mask1, mask2, upper1, upper2, delta;

    /* setup registers for upper to lower conversion */
    upper1 = _mm_set1_epi8(UPPER_LOW);
    upper2 = _mm_set1_epi8(UPPER_HIGH);
    delta = _mm_set1_epi8(UPPER_DELTA);

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(len - offset < 16)) {
            return MemcmpLowercase(s1, s2, len - offset);
        }

        /* unaligned loading of the bytes to compare */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);

        /* mark all chars bigger than upper1 */
        mask1 = _mm_cmpgt_epi8(b2, upper1);
        /* mark all chars lower than upper2 */
        mask2 = _mm_cmplt_epi8(b2, upper2);
        /* merge the two, leaving only those that are true in both */
        mask1 = _mm_cmpeq_epi8(mask1, mask2);

        /* sub delta leaves 0x20 only for uppercase positions, the
           rest is 0x00 due to the saturation (reuse mask1 reg)*/
        mask1 = _mm_subs_epu8(mask1, delta);

        /* add to b2, converting uppercase to lowercase */
        b2 = _mm_add_epi8(b2, mask1);

        /* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
        mask1 = _mm_cmpeq_epi8(b1, b2);

        int diff = len - offset;
        /* NOTE(review): the guard at the top of the loop already returned
         * when len - offset < 16, so this branch looks unreachable —
         * confirm before relying on it */
        if (diff < 16) {
            int rmask = ~(0xFFFFFFFF << diff);

            if ((_mm_movemask_epi8(mask1) & rmask) != rmask) {
                return 1;
            }
        } else {
            /* all 16 byte-compare bits must be set for a full-chunk match */
            if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
                return 1;
            }
        }

        offset += SCMEMCMP_BYTES;
        s1 += SCMEMCMP_BYTES;
        s2 += SCMEMCMP_BYTES;
    } while (len > offset);

    return 0;
}
363 
364 #elif defined(__tile__)
365 
366 #include <ctype.h>
367 
/** \brief Compare two non-empty sequences of bytes (TILE-Gx).
 *  \warning len must be > 0: the trailing do/while reads at least one byte.
 *  \retval 0 match
 *  \retval 1 mismatch
 */
static inline int SCMemcmpNZ(const void *s1, const void *s2, size_t len)
{
    uint64_t b1, w1, aligned1;
    uint64_t b2, w2, aligned2;

    /* Load aligned words containing the beginning of each string.
     * These loads don't trigger unaligned events.
     */
    w1 = __insn_ldna(s1);
    w2 = __insn_ldna(s2);
    /* Can't just read next 8 bytes because it might go past the end
     * of a page. */
    while (len > 8) {
        /* Here, the buffer extends into the next word by at least one
         * byte, so it is safe to read the next word. Do an aligned
         * loads on the next word. Then use the two words to create
         * an aligned word from each string. */
        b1 = __insn_ldna(s1 + 8);
        b2 = __insn_ldna(s2 + 8);
        aligned1 = __insn_dblalign(w1, b1, s1);
        aligned2 = __insn_dblalign(w2, b2, s2);
        if (aligned1 != aligned2)
            return 1;

        /* Move forward one word (8 bytes) */
        w1 = b1;
        w2 = b2;
        len -= 8;
        s1 += 8;
        s2 += 8;
    }
    /* Process the last up-to 8 bytes, byte by byte. */
    do {
        if (*(char*)s1 != *(char*)s2)
            return 1;
        s1++;
        s2++;
        len--;
    } while (len);

    return 0;
}
411 
412 /* Compare two sequences of bytes. */
413 static inline int SCMemcmp(const void *s1, const void *s2, size_t len)
414 {
415  if (len == 0)
416  return 0;
417  return SCMemcmpNZ(s1, s2, len);
418 }
/** \brief Convert 8 characters to lower case using SIMD.
 *
 *  Bytes in the range 'A'..'Z' get 0x20 added (the per-byte compare
 *  result, shifted left by 5, yields 0x20 for uppercase positions);
 *  all other bytes pass through unchanged.
 *
 *  \param cc Word containing the 8 bytes.
 *  \return Word containing 8-bytes each converted to lowercase.
 */
static inline uint64_t
vec_tolower(uint64_t cc)
{
    /* For uppercase letters, add 32 to convert to lower case. */
    uint64_t less_than_eq_Z = __insn_v1cmpltui (cc, 'Z' + 1);
    uint64_t less_than_A = __insn_v1cmpltui (cc, 'A');
    /* uppercase iff (c <= 'Z') and not (c < 'A'), i.e. the two
     * per-byte compare results differ */
    uint64_t is_upper = __insn_v1cmpne (less_than_eq_Z, less_than_A);
    return __insn_v1add (cc,__insn_v1shli (is_upper, 5));
}
432 
/** \brief compare two buffers in a case insensitive way (TILE-Gx)
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper and lowercase
 *  \param len number of bytes to compare
 *  \retval 0 match
 *  \retval 1 mismatch
 */
static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t len)
{
    uint64_t b1, w1, aligned1;
    uint64_t b2, w2, aligned2;

    /* a zero-length compare matches */
    if (len == 0)
        return 0;

    /* TODO Check for already aligned cases. To optimize. */

    /* Load word containing the beginning of each string.
     * These loads don't trigger unaligned events.
     */
    w1 = __insn_ldna(s1);
    w2 = __insn_ldna(s2);
    /* Can't just read next 8 bytes because it might go past the end
     * of a page. */
    while (len > 8) {
        /* Here, the buffer extends into the next word by at least one
         * byte, so it is safe to read the next word. Do aligned
         * loads on next word. Then use the two words to create an
         * aligned word from each string. */
        b1 = __insn_ldna(s1 + 8);
        b2 = __insn_ldna(s2 + 8);
        aligned1 = __insn_dblalign(w1, b1, s1);
        /* lowercase the s2 word before comparing */
        aligned2 = vec_tolower(__insn_dblalign(w2, b2, s2));
        if (aligned1 != aligned2)
            return 1;

        /* Move forward one word (8 bytes) */
        w1 = b1;
        w2 = b2;
        len -= 8;
        s1 += 8;
        s2 += 8;
    }

    /* byte-by-byte tail; len >= 1 here.
     * NOTE(review): passing a plain char to tolower() is undefined for
     * negative values where char is signed; a cast through unsigned char
     * would be safer — confirm inputs are ASCII */
    do {
        if (*(char*)s1 != tolower(*(char*)s2))
            return 1;
        s1++;
        s2++;
        len--;
    } while (len);

    return 0;
}
484 
485 #else
486 
487 /* No SIMD support, fall back to plain memcmp and a home grown lowercase one */
488 
/** \brief wrapper around memcmp to match the retvals of the SIMD
 *         implementations: 0 on an exact match, 1 on a failed match.
 *
 *  Implemented as a static inline function rather than the previous
 *  GCC statement-expression macro: it is type checked, evaluates each
 *  argument exactly once and is portable to non-GCC compilers, while
 *  call sites are unaffected.
 *
 *  \param s1 first buffer
 *  \param s2 second buffer
 *  \param len number of bytes to compare
 *  \retval 0 buffers are equal
 *  \retval 1 buffers differ
 */
static inline int SCMemcmp(const void *s1, const void *s2, size_t len)
{
    return memcmp(s1, s2, len) ? 1 : 0;
}
493 
/** \brief compare two buffers in a case insensitive way (scalar fallback,
 *         used when no SIMD implementation is available)
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper and lowercase
 *  \param len number of bytes to compare
 *  \retval 0 match
 *  \retval 1 mismatch
 */
static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t len)
{
    return MemcmpLowercase(s1, s2, len);
}
498 
499 #endif /* SIMD */
500 
501 #endif /* __UTIL_MEMCMP_H__ */
502 
#define SCMemcmp(a, b, c)
Definition: util-memcmp.h:490
void MemcmpRegisterTests(void)
Definition: util-memcmp.c:384
uint64_t offset
typedef __attribute__
DNP3 application header.
#define u8_tolower(c)
Definition: suricata.h:176
SCMutex m
Definition: flow-hash.h:105
uint8_t len
#define likely(expr)
Definition: util-optimize.h:32