use std::error;
use std::fmt;
use std::result;
use hir;
pub type Result<T> = result::Result<T, Error>;
type Range = &'static [(char, char)];
#[derive(Debug)]
pub enum Error {
PropertyNotFound,
PropertyValueNotFound,
#[allow(dead_code)]
PerlClassNotFound,
}
pub type FoldResult<T> = result::Result<T, CaseFoldError>;
#[derive(Debug)]
pub struct CaseFoldError(());
impl error::Error for CaseFoldError {}
impl fmt::Display for CaseFoldError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"Unicode-aware case folding is not available \
(probably because the unicode-case feature is not enabled)"
)
}
}
#[derive(Debug)]
pub struct UnicodeWordError(());
impl error::Error for UnicodeWordError {}
impl fmt::Display for UnicodeWordError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"Unicode-aware \\w class is not available \
(probably because the unicode-perl feature is not enabled)"
)
}
}
pub fn simple_fold(
c: char,
) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
#[cfg(not(feature = "unicode-case"))]
fn imp(
_: char,
) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
{
use std::option::IntoIter;
Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
}
#[cfg(feature = "unicode-case")]
fn imp(
c: char,
) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
{
use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
Ok(CASE_FOLDING_SIMPLE
.binary_search_by_key(&c, |&(c1, _)| c1)
.map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c))
.map_err(|i| {
if i >= CASE_FOLDING_SIMPLE.len() {
None
} else {
Some(CASE_FOLDING_SIMPLE[i].0)
}
}))
}
imp(c)
}
pub fn contains_simple_case_mapping(
start: char,
end: char,
) -> FoldResult<bool> {
#[cfg(not(feature = "unicode-case"))]
fn imp(_: char, _: char) -> FoldResult<bool> {
Err(CaseFoldError(()))
}
#[cfg(feature = "unicode-case")]
fn imp(start: char, end: char) -> FoldResult<bool> {
use std::cmp::Ordering;
use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
assert!(start <= end);
Ok(CASE_FOLDING_SIMPLE
.binary_search_by(|&(c, _)| {
if start <= c && c <= end {
Ordering::Equal
} else if c > end {
Ordering::Greater
} else {
Ordering::Less
}
})
.is_ok())
}
imp(start, end)
}
#[derive(Debug)]
pub enum ClassQuery<'a> {
OneLetter(char),
Binary(&'a str),
ByValue {
property_name: &'a str,
property_value: &'a str,
},
}
impl<'a> ClassQuery<'a> {
fn canonicalize(&self) -> Result<CanonicalClassQuery> {
match *self {
ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
ClassQuery::Binary(name) => self.canonical_binary(name),
ClassQuery::ByValue { property_name, property_value } => {
let property_name = symbolic_name_normalize(property_name);
let property_value = symbolic_name_normalize(property_value);
let canon_name = match canonical_prop(&property_name)? {
None => return Err(Error::PropertyNotFound),
Some(canon_name) => canon_name,
};
Ok(match canon_name {
"General_Category" => {
let canon = match canonical_gencat(&property_value)? {
None => return Err(Error::PropertyValueNotFound),
Some(canon) => canon,
};
CanonicalClassQuery::GeneralCategory(canon)
}
"Script" => {
let canon = match canonical_script(&property_value)? {
None => return Err(Error::PropertyValueNotFound),
Some(canon) => canon,
};
CanonicalClassQuery::Script(canon)
}
_ => {
let vals = match property_values(canon_name)? {
None => return Err(Error::PropertyValueNotFound),
Some(vals) => vals,
};
let canon_val =
match canonical_value(vals, &property_value) {
None => {
return Err(Error::PropertyValueNotFound)
}
Some(canon_val) => canon_val,
};
CanonicalClassQuery::ByValue {
property_name: canon_name,
property_value: canon_val,
}
}
})
}
}
}
fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
let norm = symbolic_name_normalize(name);
if let Some(canon) = canonical_prop(&norm)? {
return Ok(CanonicalClassQuery::Binary(canon));
}
if let Some(canon) = canonical_gencat(&norm)? {
return Ok(CanonicalClassQuery::GeneralCategory(canon));
}
if let Some(canon) = canonical_script(&norm)? {
return Ok(CanonicalClassQuery::Script(canon));
}
Err(Error::PropertyNotFound)
}
}
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
Binary(&'static str),
GeneralCategory(&'static str),
Script(&'static str),
ByValue {
property_name: &'static str,
property_value: &'static str,
},
}
pub fn class(query: ClassQuery) -> Result<hir::ClassUnicode> {
use self::CanonicalClassQuery::*;
match query.canonicalize()? {
Binary(name) => bool_property(name),
GeneralCategory(name) => gencat(name),
Script(name) => script(name),
ByValue { property_name: "Age", property_value } => {
let mut class = hir::ClassUnicode::empty();
for set in ages(property_value)? {
class.union(&hir_class(set));
}
Ok(class)
}
ByValue { property_name: "Script_Extensions", property_value } => {
script_extension(property_value)
}
ByValue {
property_name: "Grapheme_Cluster_Break",
property_value,
} => gcb(property_value),
ByValue { property_name: "Sentence_Break", property_value } => {
sb(property_value)
}
ByValue { property_name: "Word_Break", property_value } => {
wb(property_value)
}
_ => {
Err(Error::PropertyNotFound)
}
}
}
pub fn perl_word() -> Result<hir::ClassUnicode> {
#[cfg(not(feature = "unicode-perl"))]
fn imp() -> Result<hir::ClassUnicode> {
Err(Error::PerlClassNotFound)
}
#[cfg(feature = "unicode-perl")]
fn imp() -> Result<hir::ClassUnicode> {
use unicode_tables::perl_word::PERL_WORD;
Ok(hir_class(PERL_WORD))
}
imp()
}
pub fn perl_space() -> Result<hir::ClassUnicode> {
#[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
fn imp() -> Result<hir::ClassUnicode> {
Err(Error::PerlClassNotFound)
}
#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
fn imp() -> Result<hir::ClassUnicode> {
use unicode_tables::perl_space::WHITE_SPACE;
Ok(hir_class(WHITE_SPACE))
}
#[cfg(feature = "unicode-bool")]
fn imp() -> Result<hir::ClassUnicode> {
use unicode_tables::property_bool::WHITE_SPACE;
Ok(hir_class(WHITE_SPACE))
}
imp()
}
pub fn perl_digit() -> Result<hir::ClassUnicode> {
#[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
fn imp() -> Result<hir::ClassUnicode> {
Err(Error::PerlClassNotFound)
}
#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
fn imp() -> Result<hir::ClassUnicode> {
use unicode_tables::perl_decimal::DECIMAL_NUMBER;
Ok(hir_class(DECIMAL_NUMBER))
}
#[cfg(feature = "unicode-gencat")]
fn imp() -> Result<hir::ClassUnicode> {
use unicode_tables::general_category::DECIMAL_NUMBER;
Ok(hir_class(DECIMAL_NUMBER))
}
imp()
}
pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
.iter()
.map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
.collect();
hir::ClassUnicode::new(hir_ranges)
}
pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
#[cfg(not(feature = "unicode-perl"))]
fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
Err(UnicodeWordError(()))
}
#[cfg(feature = "unicode-perl")]
fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
use is_word_byte;
use std::cmp::Ordering;
use unicode_tables::perl_word::PERL_WORD;
if c <= 0x7F as char && is_word_byte(c as u8) {
return Ok(true);
}
Ok(PERL_WORD
.binary_search_by(|&(start, end)| {
if start <= c && c <= end {
Ordering::Equal
} else if start > c {
Ordering::Greater
} else {
Ordering::Less
}
})
.is_ok())
}
imp(c)
}
type PropertyValues = &'static [(&'static str, &'static str)];
fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
Ok(match normalized_value {
"any" => Some("Any"),
"assigned" => Some("Assigned"),
"ascii" => Some("ASCII"),
_ => {
let gencats = property_values("General_Category")?.unwrap();
canonical_value(gencats, normalized_value)
}
})
}
fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
let scripts = property_values("Script")?.unwrap();
Ok(canonical_value(scripts, normalized_value))
}
fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
#[cfg(not(any(
feature = "unicode-age",
feature = "unicode-bool",
feature = "unicode-gencat",
feature = "unicode-perl",
feature = "unicode-script",
feature = "unicode-segment",
)))]
fn imp(_: &str) -> Result<Option<&'static str>> {
Err(Error::PropertyNotFound)
}
#[cfg(any(
feature = "unicode-age",
feature = "unicode-bool",
feature = "unicode-gencat",
feature = "unicode-perl",
feature = "unicode-script",
feature = "unicode-segment",
))]
fn imp(name: &str) -> Result<Option<&'static str>> {
use unicode_tables::property_names::PROPERTY_NAMES;
Ok(PROPERTY_NAMES
.binary_search_by_key(&name, |&(n, _)| n)
.ok()
.map(|i| PROPERTY_NAMES[i].1))
}
imp(normalized_name)
}
fn canonical_value(
vals: PropertyValues,
normalized_value: &str,
) -> Option<&'static str> {
vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
.ok()
.map(|i| vals[i].1)
}
fn property_values(
canonical_property_name: &'static str,
) -> Result<Option<PropertyValues>> {
#[cfg(not(any(
feature = "unicode-age",
feature = "unicode-bool",
feature = "unicode-gencat",
feature = "unicode-perl",
feature = "unicode-script",
feature = "unicode-segment",
)))]
fn imp(_: &'static str) -> Result<Option<PropertyValues>> {
Err(Error::PropertyValueNotFound)
}
#[cfg(any(
feature = "unicode-age",
feature = "unicode-bool",
feature = "unicode-gencat",
feature = "unicode-perl",
feature = "unicode-script",
feature = "unicode-segment",
))]
fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
use unicode_tables::property_values::PROPERTY_VALUES;
Ok(PROPERTY_VALUES
.binary_search_by_key(&name, |&(n, _)| n)
.ok()
.map(|i| PROPERTY_VALUES[i].1))
}
imp(canonical_property_name)
}
#[allow(dead_code)]
fn property_set(
name_map: &'static [(&'static str, Range)],
canonical: &'static str,
) -> Option<Range> {
name_map
.binary_search_by_key(&canonical, |x| x.0)
.ok()
.map(|i| name_map[i].1)
}
fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
#[cfg(not(feature = "unicode-age"))]
fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
use std::option::IntoIter;
Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
}
#[cfg(feature = "unicode-age")]
fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
use unicode_tables::age;
const AGES: &'static [(&'static str, Range)] = &[
("V1_1", age::V1_1),
("V2_0", age::V2_0),
("V2_1", age::V2_1),
("V3_0", age::V3_0),
("V3_1", age::V3_1),
("V3_2", age::V3_2),
("V4_0", age::V4_0),
("V4_1", age::V4_1),
("V5_0", age::V5_0),
("V5_1", age::V5_1),
("V5_2", age::V5_2),
("V6_0", age::V6_0),
("V6_1", age::V6_1),
("V6_2", age::V6_2),
("V6_3", age::V6_3),
("V7_0", age::V7_0),
("V8_0", age::V8_0),
("V9_0", age::V9_0),
("V10_0", age::V10_0),
("V11_0", age::V11_0),
("V12_0", age::V12_0),
("V12_1", age::V12_1),
("V13_0", age::V13_0),
];
assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
match pos {
None => Err(Error::PropertyValueNotFound),
Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| classes)),
}
}
imp(canonical_age)
}
fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
#[cfg(not(feature = "unicode-gencat"))]
fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
Err(Error::PropertyNotFound)
}
#[cfg(feature = "unicode-gencat")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::general_category::BY_NAME;
match name {
"ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
"Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
"Assigned" => {
let mut cls = gencat("Unassigned")?;
cls.negate();
Ok(cls)
}
name => property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound),
}
}
match canonical_name {
"Decimal_Number" => perl_digit(),
name => imp(name),
}
}
fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
#[cfg(not(feature = "unicode-script"))]
fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
Err(Error::PropertyNotFound)
}
#[cfg(feature = "unicode-script")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::script::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
imp(canonical_name)
}
fn script_extension(
canonical_name: &'static str,
) -> Result<hir::ClassUnicode> {
#[cfg(not(feature = "unicode-script"))]
fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
Err(Error::PropertyNotFound)
}
#[cfg(feature = "unicode-script")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::script_extension::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
imp(canonical_name)
}
fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
#[cfg(not(feature = "unicode-bool"))]
fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
Err(Error::PropertyNotFound)
}
#[cfg(feature = "unicode-bool")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::property_bool::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyNotFound)
}
match canonical_name {
"Decimal_Number" => perl_digit(),
"White_Space" => perl_space(),
name => imp(name),
}
}
fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
#[cfg(not(feature = "unicode-segment"))]
fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
Err(Error::PropertyNotFound)
}
#[cfg(feature = "unicode-segment")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::grapheme_cluster_break::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
imp(canonical_name)
}
fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
#[cfg(not(feature = "unicode-segment"))]
fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
Err(Error::PropertyNotFound)
}
#[cfg(feature = "unicode-segment")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::word_break::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
imp(canonical_name)
}
fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
#[cfg(not(feature = "unicode-segment"))]
fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
Err(Error::PropertyNotFound)
}
#[cfg(feature = "unicode-segment")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::sentence_break::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)
}
imp(canonical_name)
}
fn symbolic_name_normalize(x: &str) -> String {
let mut tmp = x.as_bytes().to_vec();
let len = symbolic_name_normalize_bytes(&mut tmp).len();
tmp.truncate(len);
String::from_utf8(tmp).unwrap()
}
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
let mut start = 0;
let mut starts_with_is = false;
if slice.len() >= 2 {
starts_with_is = slice[0..2] == b"is"[..]
|| slice[0..2] == b"IS"[..]
|| slice[0..2] == b"iS"[..]
|| slice[0..2] == b"Is"[..];
if starts_with_is {
start = 2;
}
}
let mut next_write = 0;
for i in start..slice.len() {
let b = slice[i];
if b == b' ' || b == b'_' || b == b'-' {
continue;
} else if b'A' <= b && b <= b'Z' {
slice[next_write] = b + (b'a' - b'A');
next_write += 1;
} else if b <= 0x7F {
slice[next_write] = b;
next_write += 1;
}
}
if starts_with_is && next_write == 1 && slice[0] == b'c' {
slice[0] = b'i';
slice[1] = b's';
slice[2] = b'c';
next_write = 3;
}
&mut slice[..next_write]
}
#[cfg(test)]
mod tests {
use super::{
contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
symbolic_name_normalize_bytes,
};
#[cfg(feature = "unicode-case")]
fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
simple_fold(c).unwrap().unwrap()
}
#[cfg(feature = "unicode-case")]
fn simple_fold_err(c: char) -> Option<char> {
match simple_fold(c).unwrap() {
Ok(_) => unreachable!("simple_fold returned Ok iterator"),
Err(next) => next,
}
}
#[cfg(feature = "unicode-case")]
fn contains_case_map(start: char, end: char) -> bool {
contains_simple_case_mapping(start, end).unwrap()
}
#[test]
#[cfg(feature = "unicode-case")]
fn simple_fold_k() {
let xs: Vec<char> = simple_fold_ok('k').collect();
assert_eq!(xs, vec!['K', 'K']);
let xs: Vec<char> = simple_fold_ok('K').collect();
assert_eq!(xs, vec!['k', 'K']);
let xs: Vec<char> = simple_fold_ok('K').collect();
assert_eq!(xs, vec!['K', 'k']);
}
#[test]
#[cfg(feature = "unicode-case")]
fn simple_fold_a() {
let xs: Vec<char> = simple_fold_ok('a').collect();
assert_eq!(xs, vec!['A']);
let xs: Vec<char> = simple_fold_ok('A').collect();
assert_eq!(xs, vec!['a']);
}
#[test]
#[cfg(feature = "unicode-case")]
fn simple_fold_empty() {
assert_eq!(Some('A'), simple_fold_err('?'));
assert_eq!(Some('A'), simple_fold_err('@'));
assert_eq!(Some('a'), simple_fold_err('['));
assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
}
#[test]
#[cfg(feature = "unicode-case")]
fn simple_fold_max() {
assert_eq!(None, simple_fold_err('\u{10FFFE}'));
assert_eq!(None, simple_fold_err('\u{10FFFF}'));
}
#[test]
#[cfg(not(feature = "unicode-case"))]
fn simple_fold_disabled() {
assert!(simple_fold('a').is_err());
}
#[test]
#[cfg(feature = "unicode-case")]
fn range_contains() {
assert!(contains_case_map('A', 'A'));
assert!(contains_case_map('Z', 'Z'));
assert!(contains_case_map('A', 'Z'));
assert!(contains_case_map('@', 'A'));
assert!(contains_case_map('Z', '['));
assert!(contains_case_map('☃', 'Ⰰ'));
assert!(!contains_case_map('[', '['));
assert!(!contains_case_map('[', '`'));
assert!(!contains_case_map('☃', '☃'));
}
#[test]
#[cfg(not(feature = "unicode-case"))]
fn range_contains_disabled() {
assert!(contains_simple_case_mapping('a', 'a').is_err());
}
#[test]
#[cfg(feature = "unicode-gencat")]
fn regression_466() {
use super::{CanonicalClassQuery, ClassQuery};
let q = ClassQuery::OneLetter('C');
assert_eq!(
q.canonicalize().unwrap(),
CanonicalClassQuery::GeneralCategory("Other")
);
}
#[test]
fn sym_normalize() {
let sym_norm = symbolic_name_normalize;
assert_eq!(sym_norm("Line_Break"), "linebreak");
assert_eq!(sym_norm("Line-break"), "linebreak");
assert_eq!(sym_norm("linebreak"), "linebreak");
assert_eq!(sym_norm("BA"), "ba");
assert_eq!(sym_norm("ba"), "ba");
assert_eq!(sym_norm("Greek"), "greek");
assert_eq!(sym_norm("isGreek"), "greek");
assert_eq!(sym_norm("IS_Greek"), "greek");
assert_eq!(sym_norm("isc"), "isc");
assert_eq!(sym_norm("is c"), "isc");
assert_eq!(sym_norm("is_c"), "isc");
}
#[test]
fn valid_utf8_symbolic() {
let mut x = b"abc\xFFxyz".to_vec();
let y = symbolic_name_normalize_bytes(&mut x);
assert_eq!(y, b"abcxyz");
}
}