Python|utf8字符串截取

UTF-8 字符串采用变长编码,日常截取字符串时如果处理不好,就会把一个多字节字符从中间截断而出现乱码。针对这个问题,下面给出一个通用的 UTF-8 字符串截取示例。示例分别用 Python 和 PHP 实现,根据同样的原理可以用任何语言来实现。
# python
# utf8 string length
def safestrlength_utf8(sourcestr):
    """Return the number of characters in a UTF-8 encoded string.

    The byte string is walked one character at a time.  The width of each
    character is taken from its first byte using the thresholds
    192 / 224 / 240 / 248 / 252 (2 to 6 bytes); any byte below 192 counts
    as a one-byte character.  The input is not validated: a sequence that
    is cut short at the end of the string still counts as one character.

    Args:
        sourcestr: UTF-8 encoded ``bytes`` or ``bytearray``.  A decoded
            text string is also accepted and is measured with ``len()``.

    Returns:
        The character count as an ``int``; ``0`` for empty input.
    """
    if not isinstance(sourcestr, (bytes, bytearray)):
        # Decoded text is not a byte sequence, so lead-byte arithmetic
        # does not apply to it.
        return len(sourcestr)

    # bytearray yields integer byte values on indexing, so no ord() call
    # on a one-byte slice is needed.
    data = bytearray(sourcestr)
    str_length = len(data)
    i = 0
    n = 0

    while i < str_length:
        lead = data[i]
        if lead >= 252:
            i += 6
        elif lead >= 248:
            i += 5
        elif lead >= 240:
            i += 4
        elif lead >= 224:
            i += 3
        elif lead >= 192:
            i += 2
        else:
            i += 1
        n += 1

    # The loop stops exactly at the end of the data, so n is already the
    # character count and must not be decremented.
    return n

# utf8 substr
def safesubstr_utf8(sourcestr, cutlength):
    """Return the first ``cutlength`` characters of a UTF-8 encoded string.

    The cut always falls on a character boundary, so a multi-byte
    character is never split.  The width of each character is taken from
    its first byte using the thresholds 192 / 224 / 240 / 248 / 252
    (2 to 6 bytes); any byte below 192 counts as a one-byte character.
    The input is not validated.

    Args:
        sourcestr: UTF-8 encoded ``bytes`` or ``bytearray``.  A decoded
            text string is also accepted and is cut with ordinary slicing.
        cutlength: Maximum number of characters to keep.  Zero or a
            negative value yields an empty result.

    Returns:
        The leading part of ``sourcestr``, of the same type as the input.
        The whole input is returned when it holds no more than
        ``cutlength`` characters.
    """
    if not isinstance(sourcestr, (bytes, bytearray)):
        # Decoded text is not a byte sequence, so lead-byte arithmetic
        # does not apply to it.
        return sourcestr[:max(cutlength, 0)]

    # bytearray yields integer byte values on indexing, so no ord() call
    # on a one-byte slice is needed.
    data = bytearray(sourcestr)
    str_length = len(data)
    i = 0
    n = 0

    # Stop at the end of the data: there is no lead byte to inspect at
    # index == str_length.
    while n < cutlength and i < str_length:
        lead = data[i]
        if lead >= 252:
            i += 6
        elif lead >= 248:
            i += 5
        elif lead >= 240:
            i += 4
        elif lead >= 224:
            i += 3
        elif lead >= 192:
            i += 2
        else:
            i += 1
        n += 1

    # The selected characters are contiguous from offset 0, so one slice
    # replaces collecting pieces and joining them.  Slicing clamps at the
    # end of the input when the last sequence is cut short.
    return sourcestr[:i]

// php
// Substring for a UTF-8 string: each multi-byte character counts as length 1.
// Returns the leading $cutlength characters of $sourcestr without splitting a
// multi-byte character. The byte width of each character is taken from its
// first byte; bytes below 192 count as one-byte characters.
public static function safesubstr_utf8($sourcestr, $cutlength) // {{{
{
    // Lowest lead-byte value => byte width, checked from the widest form down.
    $widths = array(252 => 6, 248 => 5, 240 => 4, 224 => 3, 192 => 2);

    $total = strlen($sourcestr);
    $result = '';
    $offset = 0;

    // The bound allows one pass at $offset == $total; the substr() call then
    // contributes nothing, so the result is unaffected.
    for ($taken = 0; ($taken < $cutlength) && ($offset <= $total); $taken++)
    {
        $lead = Ord(substr($sourcestr, $offset, 1));

        $width = 1;
        foreach ($widths as $floor => $bytes)
        {
            if ($lead >= $floor)
            {
                $width = $bytes;
                break;
            }
        }

        $result .= substr($sourcestr, $offset, $width);
        $offset += $width;
    }

    return $result;
} // }}}

// Length of a UTF-8 string: each multi-byte character counts as length 1.
// The byte width of each character is taken from its first byte using the
// thresholds 192 / 224 / 240 / 248 / 252 (2 to 6 bytes); bytes below 192 count
// as one-byte characters. The input is not validated: a sequence cut short at
// the end of the string still counts as one character.
// Returns the character count as an int, 0 for an empty string.
public static function safestrlength_utf8($sourcestr) // {{{
{
    $i = 0;
    $n = 0;
    $str_length = strlen($sourcestr);

    // Stop exactly at the end of the string. Looping while $i <= $str_length
    // and decrementing afterwards undercounts by one whenever the last
    // sequence is truncated, because $i then jumps past $str_length and the
    // extra pass being compensated for never happens.
    while($i < $str_length)
    {
        $ascnum = Ord(substr($sourcestr, $i, 1));
        if($ascnum >= 252)
        {
            $i = $i + 6;
        }
        elseif($ascnum >= 248)
        {
            $i = $i + 5;
        }
        elseif($ascnum >= 240)
        {
            $i = $i + 4;
        }
        elseif($ascnum >= 224)
        {
            $i = $i + 3;
        }
        elseif($ascnum >= 192)
        {
            $i = $i + 2;
        }
        else
        {
            $i = $i + 1;
        }
        $n++;
    }

    return $n;
} // }}}

U-00000000 - U-0000007F:
0xxxxxxx
U-00000080 - U-000007FF:
110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF:
1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF:
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF:
111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF:
1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

    推荐阅读