summaryrefslogtreecommitdiffstats
path: root/src/utfsize.c
blob: 504c147795ca7bcd04319bac3ae9e07f857b850a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*

    File: utfsize.c

    Copyright (C) 2021 Christophe GRENIER <grenier@cgsecurity.org>

    This software is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc., 51
    Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

 */

#if !defined(SINGLE_FORMAT) || defined(SINGLE_FORMAT_txt) || defined(SINGLE_FORMAT_win)
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#include "log.h"
#include "utfsize.h"

/*
 * Count how many leading bytes of buffer are accepted as text.
 *
 * The scan walks the buffer from the start and stops at the first byte
 * that is rejected:
 *   - a NUL byte,
 *   - one of the lead bytes 0xc0, 0xc1, 0xf7 or any byte >= 0xfd,
 *   - a control character from the list in the switch below
 *     (0x01-0x07, 0x0b, 0x0c, 0x10-0x1f, 0x7f) met outside a recognised
 *     2- or 3-byte sequence.
 * A lead byte 0xe0-0xef followed by two continuation bytes (10xxxxxx) is
 * consumed as a 3-byte sequence, a lead byte 0xc0-0xdf followed by one
 * continuation byte as a 2-byte sequence.  A sequence cut short by the end
 * of the buffer is accepted as long as the bytes that are present match.
 * Every other byte is consumed on its own.
 *
 * buffer  : bytes to examine, at least buf_len bytes must be readable
 * buf_len : number of bytes available in buffer
 *
 * Returns the offset of the first rejected byte, or buf_len when the whole
 * buffer has been accepted.  The result is never greater than buf_len.
 */
int UTFsize(const unsigned char *buffer, const unsigned int buf_len)
{
  unsigned int i=0;
  /*@
    @ loop invariant 0 <= i <= buf_len;
    @ loop assigns i;
    @ loop variant buf_len - i;
    @*/
  while(i<buf_len)
  {
    const unsigned char c=buffer[i];
    /* Bytes left from position i, always >= 1 inside the loop.
     * Comparing against this count instead of computing i+1 / i+2 keeps
     * the bounds checks safe from unsigned wrap-around. */
    const unsigned int remaining=buf_len-i;
    unsigned int seq_len;
    if(c=='\0')
      return i;
    /* Reject some invalid UTF-8 sequences */
    if(c==0xc0 || c==0xc1 || c==0xf7 || c>=0xfd)
      return i;
    /*@ assert remaining < 2 || \valid_read(buffer + i + 1); */
    /*@ assert remaining < 3 || \valid_read(buffer + i + 2); */
    if((c & 0xf0)==0xe0 &&
        (remaining<2 || (buffer[i+1] & 0xc0)==0x80) &&
        (remaining<3 || (buffer[i+2] & 0xc0)==0x80))
    { /* UTF8 l=3 */
#ifdef DEBUG_TXT
      log_info("UTFsize i=%u l=3\n", i);
#endif
      seq_len=3;
    }
    else if((c & 0xe0)==0xc0 &&
        (remaining<2 || (buffer[i+1] & 0xc0)==0x80))
    { /* UTF8 l=2 */
#ifdef DEBUG_TXT
      log_info("UTFsize i=%u l=2\n", i);
#endif
      seq_len=2;
    }
    else
    { /* Ascii UCS */
#ifdef DEBUG_TXT
      log_info("UTFsize i=%u l=1 ? *p=%c\n", i, c);
#endif
      /* Control characters that end the text; NUL was handled above. */
      switch(c)
      {
        case 0x01:
        case 0x02:
        case 0x03:
        case 0x04:
        case 0x05:
        case 0x06:
        case 0x07:
        case 0x0b:
        case 0x0c:
        case 0x10:
        case 0x11:
        case 0x12:
        case 0x13:
        case 0x14:
        case 0x15:
        case 0x16:
        case 0x17:
        case 0x18:
        case 0x19:
        case 0x1a:
        case 0x1b:
        case 0x1c:
        case 0x1d:
        case 0x1e:
        case 0x1f:
        case 0x7f:
          return i;
        default:
          break;
      }
      seq_len=1;
    }
    /* The accepted sequence reaches (or is truncated by) the end of the
     * buffer: everything was accepted.  Returning here avoids moving the
     * index past buf_len. */
    if(seq_len>=remaining)
      return buf_len;
    i+=seq_len;
  }
  /* Only reached for an empty buffer. */
  return buf_len;
}
#endif