Python|utf8字符串截取
UTF-8 字符串是变长编码的字符串,日常处理中如果直接按字节截取,处理不当就会出现乱码。针对这个问题,下面给出一个通用的 UTF-8 字符串截取示例,分别用 Python 和 PHP 实现;根据同样的原理,可以用任何语言来实现。
# python
# utf8 string length
def safestrlength_utf8(sourcestr):
    """Return the number of UTF-8 characters in *sourcestr*.

    A byte string is walked sequence by sequence: the lead byte decides how
    many bytes the character occupies (1-6, which includes the legacy 5- and
    6-byte forms).  Bytes that are not lead bytes (ASCII or stray
    continuation bytes) count as one character each, and a sequence that is
    cut off at the end of the input still counts as one character.

    Args:
        sourcestr: UTF-8 encoded ``bytes``/``bytearray``.  A text string is
            also accepted; it is already a sequence of code points, so its
            plain length is returned.

    Returns:
        The character count as an ``int`` (``0`` for empty input).
    """
    if not isinstance(sourcestr, (bytes, bytearray)):
        # Text is not UTF-8 bytes; inspecting code points as if they were
        # lead bytes would miscount every non-ASCII character.
        return len(sourcestr)

    data = bytearray(sourcestr)  # indexing a bytearray always yields ints
    total = len(data)
    i = 0
    n = 0
    while i < total:
        lead = data[i]
        if lead >= 0xFC:
            i += 6
        elif lead >= 0xF8:
            i += 5
        elif lead >= 0xF0:
            i += 4
        elif lead >= 0xE0:
            i += 3
        elif lead >= 0xC0:
            i += 2
        else:
            i += 1
        n += 1
    # The loop stops at the end of the data, so n is already the exact count
    # and needs no trailing adjustment.
    return n
# utf8 substr
def safesubstr_utf8(sourcestr, cutlength):
    """Return the first *cutlength* UTF-8 characters of *sourcestr*.

    A byte string is walked sequence by sequence: the lead byte decides how
    many bytes the character occupies (1-6, which includes the legacy 5- and
    6-byte forms), so the cut never lands inside a multi-byte character.
    Bytes that are not lead bytes count as one character each.

    Args:
        sourcestr: UTF-8 encoded ``bytes``/``bytearray``.  A text string is
            also accepted and is cut by code points.
        cutlength: Maximum number of characters to keep (an integer).

    Returns:
        A prefix of *sourcestr*, of the same type as the input.  The whole
        input is returned when it holds fewer than *cutlength* characters;
        an empty value is returned when *cutlength* is zero or negative.
    """
    if not isinstance(sourcestr, (bytes, bytearray)):
        # Text is already indexed by code point.  Guard against a negative
        # length, which a plain slice would interpret as "from the end".
        return sourcestr[:cutlength] if cutlength > 0 else sourcestr[:0]

    data = bytearray(sourcestr)  # indexing a bytearray always yields ints
    total = len(data)
    i = 0
    n = 0
    # Strictly less-than: index `total` is past the last byte and has no
    # lead byte to inspect.
    while n < cutlength and i < total:
        lead = data[i]
        if lead >= 0xFC:
            i += 6
        elif lead >= 0xF8:
            i += 5
        elif lead >= 0xF0:
            i += 4
        elif lead >= 0xE0:
            i += 3
        elif lead >= 0xC0:
            i += 2
        else:
            i += 1
        n += 1
    # The consumed sequences are contiguous from offset 0, so the result is
    # one slice; slicing clips `i` if the last sequence was truncated.
    return sourcestr[:i]
// php
// substr for a UTF-8 string; each UTF-8 character counts as length 1
/**
 * Return the first $cutlength UTF-8 characters of $sourcestr.
 *
 * The string is walked byte-wise: the lead byte of each sequence decides how
 * many bytes the character occupies (1-6, which includes the legacy 5- and
 * 6-byte forms), so the cut never lands inside a multi-byte character.
 * Bytes that are not lead bytes count as one character each.
 *
 * @param string $sourcestr UTF-8 encoded byte string.
 * @param int    $cutlength Maximum number of characters to keep.
 * @return string Prefix of $sourcestr; empty when $cutlength is zero or negative.
 */
public static function safesubstr_utf8($sourcestr, $cutlength) // {{{
{
    $returnstr = '';
    $i = 0;
    $n = 0;
    $str_length = strlen($sourcestr);
    // Strictly less-than: offset $str_length is already past the last byte,
    // so there is nothing left to read there.
    while (($n < $cutlength) && ($i < $str_length))
    {
        $ascnum = ord(substr($sourcestr, $i, 1));
        if ($ascnum >= 252)
        {
            $width = 6;
        }
        elseif ($ascnum >= 248)
        {
            $width = 5;
        }
        elseif ($ascnum >= 240)
        {
            $width = 4;
        }
        elseif ($ascnum >= 224)
        {
            $width = 3;
        }
        elseif ($ascnum >= 192)
        {
            $width = 2;
        }
        else
        {
            $width = 1;
        }
        // substr() clips at the end of the string, so a truncated final
        // sequence is copied as far as it exists.
        $returnstr .= substr($sourcestr, $i, $width);
        $i += $width;
        $n++;
    }
    return $returnstr;
} // }}}
// get the length of a UTF-8 string; each UTF-8 character counts as length 1
/**
 * Return the number of UTF-8 characters in $sourcestr.
 *
 * The string is walked byte-wise: the lead byte of each sequence decides how
 * many bytes the character occupies (1-6, which includes the legacy 5- and
 * 6-byte forms). Bytes that are not lead bytes count as one character each,
 * and a sequence cut off at the end of the string still counts as one.
 *
 * @param string $sourcestr UTF-8 encoded byte string.
 * @return int Character count (0 for an empty string).
 */
public static function safestrlength_utf8($sourcestr) // {{{
{
    $i = 0;
    $n = 0;
    $str_length = strlen($sourcestr);
    // Strictly less-than: the loop ends exactly when the bytes run out, so
    // $n needs no correction afterwards, even when the last sequence is
    // truncated and $i jumps past the end.
    while ($i < $str_length)
    {
        $ascnum = ord(substr($sourcestr, $i, 1));
        if ($ascnum >= 252)
        {
            $i += 6;
        }
        elseif ($ascnum >= 248)
        {
            $i += 5;
        }
        elseif ($ascnum >= 240)
        {
            $i += 4;
        }
        elseif ($ascnum >= 224)
        {
            $i += 3;
        }
        elseif ($ascnum >= 192)
        {
            $i += 2;
        }
        else
        {
            $i += 1;
        }
        $n++;
    }
    return $n;
} // }}}
U-00000000 - U-0000007F: |
0xxxxxxx |
U-00000080 - U-000007FF: |
110xxxxx 10xxxxxx |
U-00000800 - U-0000FFFF: |
1110xxxx 10xxxxxx 10xxxxxx |
U-00010000 - U-001FFFFF: |
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
U-00200000 - U-03FFFFFF: |
111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
U-04000000 - U-7FFFFFFF: |
1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
推荐阅读
- python学习之|python学习之 实现QQ自动发送消息
- 逻辑回归的理解与python示例
- 一起来学习C语言的字符串转换函数
- python自定义封装带颜色的logging模块
- 【Leetcode/Python】001-Two|【Leetcode/Python】001-Two Sum
- Python基础|Python基础 - 练习1
- Python爬虫|Python爬虫 --- 1.4 正则表达式(re库)
- Python(pathlib模块)
- python青少年编程比赛_第十一届蓝桥杯大赛青少年创意编程组比赛细则
- Python数据分析(一)(Matplotlib使用)