Counting UTF-8 bytes, code points and grapheme clusters
Strings to test:
• Star emoji: ⭐ (U+2B50)
- UTF-8 bytes: 3
- code points: 1
- grapheme clusters: 1
• Heart emoji: ❤️ (U+2764 U+FE0F)
- UTF-8 bytes: 6
- code points: 2
- grapheme clusters: 1
• England flag emoji: 🏴󠁧󠁢󠁥󠁮󠁧󠁿 (U+1F3F4 followed by the tag characters U+E0067 U+E0062 U+E0065 U+E006E U+E0067 U+E007F)
- UTF-8 bytes: 28
- code points: 7
- grapheme clusters: 1
• Lenny Face
( ͡° ͜ʖ ͡°)
- UTF-8 bytes: 17
- code points: 11
- grapheme clusters: 8
• Zalgo
Ḣ̸̠͈͋͠͝ẽ̶̮l̶̜̖̗͒̂͝l̶̪̰͎̠͋̆ǫ̷̙͍̻̈́̓͒̒ ̸̘̟̭͈̐̉w̶͉͍̤͌ö̶̤̫̐r̷͎̗̄l̸̢̰̝̈́͆̀̍d̷̻̽̈̓
- UTF-8 bytes: 145
- code points: 78
- grapheme clusters: 11
// PHP (core functions and extensions)
//
// Sample string: the England flag emoji, i.e. U+1F3F4 followed by six tag
// characters (7 code points, 28 UTF-8 bytes). It is written with \u{...}
// escapes (PHP 7.0+) so the invisible tag characters cannot be lost or
// padded with whitespace when the snippet is copied.
$str = "\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}\u{E007F}";

// Count UTF-8 bytes
echo strlen($str) . "\n";

// Count code points
// The encoding is passed explicitly so the result does not depend on the
// configured internal encoding.
echo mb_strlen($str, 'UTF-8') . "\n"; // require ext-mbstring

// Count grapheme clusters
echo grapheme_strlen($str) . "\n"; // require ext-intl

// Alternative to count grapheme clusters
// \X matches one whole grapheme cluster, so splitting on it leaves one empty
// piece more than there are clusters; hence the "- 1".
echo (count(preg_split('/\X/u', $str)) - 1) . "\n";
PHP with Symfony Framework
// composer require symfony/string
// Namespaced names must not contain whitespace: PHP 8 lexes each name as a
// single token and rejects "Symfony \Component \...".
use Symfony\Component\String\ByteString;
use Symfony\Component\String\CodePointString;
use Symfony\Component\String\UnicodeString;

// Sample string: the England flag emoji, i.e. U+1F3F4 followed by six tag
// characters (7 code points, 28 UTF-8 bytes), written with \u{...} escapes
// so the invisible tag characters survive copy/paste.
$str = "\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}\u{E007F}";

// Count UTF-8 bytes
echo (new ByteString($str))->length() . "\n";

// Count code points
echo (new CodePointString($str))->length() . "\n";

// Count grapheme clusters
echo (new UnicodeString($str))->length() . "\n";
# Bash
#
# Sample string: the England flag emoji, i.e. U+1F3F4 followed by six tag
# characters (7 code points, 28 UTF-8 bytes). The \U escapes inside $'...'
# need bash 4.2 or newer and are expected to be run in a UTF-8 locale.
str=$'\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006E\U000E0067\U000E007F'

# Count UTF-8 bytes
# printf '%s' writes the string with no trailing newline and no padding, so
# wc sees exactly the bytes of $str.
printf '%s' "$str" | wc -c # To view the bytes: printf '%s' "$str" | hd

# Count code points
# wc -m counts characters according to the current locale.
printf '%s' "$str" | wc -m # To view the code points: unicode --brief "$str"
-- MySQL (LENGTH() counts bytes, CHAR_LENGTH() counts characters)
--
-- Sample string: the England flag emoji, i.e. U+1F3F4 followed by six tag
-- characters (7 code points, 28 UTF-8 bytes). It is given as a hex literal
-- with a utf8mb4 introducer so the value does not depend on the client
-- character set and cannot pick up stray whitespace when copied.
SET @str = _utf8mb4 X'F09F8FB4F3A081A7F3A081A2F3A081A5F3A081AEF3A081A7F3A081BF';

-- Count UTF-8 bytes
SELECT LENGTH(@str) AS bytes;

-- Count code points
SELECT CHAR_LENGTH(@str) AS code_points;
-- PostgreSQL (bytea and the :: cast are PostgreSQL syntax)
--
-- Sample string: the England flag emoji, i.e. U+1F3F4 followed by six tag
-- characters (7 code points, 28 UTF-8 bytes). It is written as a U&'...'
-- Unicode-escape string so the invisible tag characters survive copy/paste;
-- this assumes a UTF8 server encoding.

-- Count UTF-8 bytes
SELECT LENGTH(CAST(U&'\+01F3F4\+0E0067\+0E0062\+0E0065\+0E006E\+0E0067\+0E007F' AS bytea)) AS bytes;
SELECT LENGTH(U&'\+01F3F4\+0E0067\+0E0062\+0E0065\+0E006E\+0E0067\+0E007F'::bytea) AS bytes;

-- Count code points
SELECT LENGTH(U&'\+01F3F4\+0E0067\+0E0062\+0E0065\+0E006E\+0E0067\+0E007F') AS code_points;
-- SQLite (presumably; the snippet is unlabelled)
--
-- Sample string: the England flag emoji, i.e. U+1F3F4 followed by six tag
-- characters (7 code points, 28 UTF-8 bytes). It is built with char() from
-- the code points so the invisible tag characters survive copy/paste. The
-- byte count assumes the database uses the UTF-8 text encoding.

-- Count UTF-8 bytes
SELECT LENGTH(CAST(char(0x1F3F4, 0xE0067, 0xE0062, 0xE0065, 0xE006E, 0xE0067, 0xE007F) AS blob)) AS bytes;

-- Count code points
SELECT LENGTH(char(0x1F3F4, 0xE0067, 0xE0062, 0xE0065, 0xE006E, 0xE0067, 0xE007F)) AS code_points;
# Python 3
import grapheme  # pip install grapheme

# Sample string: the England flag emoji, i.e. U+1F3F4 followed by six tag
# characters (7 code points, 28 UTF-8 bytes), written with \U escapes so the
# invisible tag characters survive copy/paste.
# Named "text" rather than "str" so the builtin str type is not shadowed.
text = '\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006E\U000E0067\U000E007F'

# Count UTF-8 bytes
print(len(text.encode('utf-8')))

# Count code points
print(len(text))

# Count grapheme clusters
print(grapheme.length(text))
// JavaScript (Node.js)
//
// Sample string: the England flag emoji, i.e. U+1F3F4 followed by six tag
// characters (7 code points, 28 UTF-8 bytes), written with \u{...} escapes
// so the invisible tag characters survive copy/paste.
// JavaScript strings are sequences of UTF-16 code units, so str.length would
// count code units, not bytes or code points.
const str = '\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}\u{E007F}';

// Count UTF-8 bytes
const encoder = new TextEncoder();
console.log(encoder.encode(str).length);

// Count UTF-8 bytes (alternative)
const utf8 = require('utf8'); // npm install utf8
console.log(utf8.encode(str).length);

// Count code points
// The string iterator yields whole code points, not UTF-16 code units.
console.log([...str].length);

// Count code points (alternative)
// The trailing slash makes require() load the npm package rather than
// Node's deprecated built-in module of the same name.
const punycode = require('punycode/'); // npm install punycode
console.log(punycode.ucs2.decode(str).length);

// Count grapheme clusters
const GraphemeSplitter = require('grapheme-splitter'); // npm install grapheme-splitter
console.log(new GraphemeSplitter().splitGraphemes(str).length);

// Count grapheme clusters (alternative, built in; no dependency needed)
const segmenter = new Intl.Segmenter(undefined, { granularity: 'grapheme' });
console.log([...segmenter.segment(str)].length);