mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2025-01-03 14:38:55 -05:00
285 lines
8.9 KiB
Ragel
285 lines
8.9 KiB
Ragel
// Copyright (c) 2015 Couchbase, Inc.
|
||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||
// except in compliance with the License. You may obtain a copy of the License at
|
||
// http://www.apache.org/licenses/LICENSE-2.0
|
||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||
// either express or implied. See the License for the specific language governing permissions
|
||
// and limitations under the License.
|
||
|
||
// +build BUILDTAGS
|
||
|
||
package segment
|
||
|
||
import (
|
||
"fmt"
|
||
"unicode/utf8"
|
||
)
|
||
|
||
// RagelFlags is the literal string "RAGELFLAGS" in this template.
// NOTE(review): presumably a placeholder that the build replaces with the
// flags passed to ragel (like BUILDTAGS in the build constraint above) --
// confirm against the generation script.
var RagelFlags = "RAGELFLAGS"
|
||
|
||
// ParseError is the error segmentWords returns when the scanner finishes
// with its current state below s_first_final (a non-final state).
var ParseError = fmt.Errorf("unicode word segmentation parse error")
|
||
|
||
// Word Types
//
// Type codes that segmentWords appends to its types result, one entry per
// emitted token, in the same order as the tokens themselves.
|
||
const (
|
||
// None is appended by finishNoneToken: regional-indicator runs, line
// breaks, Extend/Format runs and any other character.
None = iota
|
||
// Number is appended by finishNumericToken (WordNumeric matches).
Number
|
||
// Letter is appended by finishHangulToken and finishWordToken.
Letter
|
||
// Kana is declared but no action in this file appends it; Katakana and
// Hiragana tokens are currently tagged Ideo.
Kana
|
||
// Ideo is appended by finishKatakanaToken, finishHanToken and
// finishHiraganaToken.
Ideo
|
||
)
|
||
|
||
// Ragel section: names the state machine "s" and asks Ragel to emit the
// machine's static data at this point in the generated Go file. Generated
// identifiers carry the machine name as a prefix (segmentWords reads
// s_first_final below).
%%{
|
||
machine s;
|
||
write data;
|
||
}%%
|
||
|
||
// segmentWords scans data with the Ragel-generated word-boundary scanner and
// appends each recognised token, plus its word-type code, to val and types.
//
// Parameters:
//   - data: the bytes to segment. Emitted tokens are sub-slices of data
//     (data[startPos:endPos+1]), not copies.
//   - maxTokens: when > 0, the function returns as soon as len(val) reaches
//     it. When <= 0 there is no limit; a negative value additionally selects
//     a default capacity of 1000 for freshly allocated result slices.
//   - atEOF: whether data is the end of the input. When false, numeric and
//     general word tokens are never emitted (their actions return at once),
//     and the other token kinds are held back if they end exactly at
//     len(data).
//   - val, types: optional slices to append to; nil means allocate new ones.
//
// Returns the (possibly grown) val and types, the byte offset just past the
// last emitted token, and ParseError if the scanner ends in a non-final
// state (nil otherwise).
func segmentWords(data []byte, maxTokens int, atEOF bool, val [][]byte, types []int) ([][]byte, []int, int, error) {
|
||
// cs, p and pe are read and written by the generated scanner code.
// NOTE(review): presumably current state, read position and end of input,
// following the usual Ragel convention -- only pe = len(data) is evident here.
cs, p, pe := 0, 0, len(data)
|
||
// Initial capacity for newly allocated result slices.
// NOTE(review): this local shadows the builtin cap for the rest of the function.
cap := maxTokens
|
||
if cap < 0 {
|
||
cap = 1000
|
||
}
|
||
if val == nil {
|
||
val = make([][]byte, 0, cap)
|
||
}
|
||
if types == nil {
|
||
types = make([]int, 0, cap)
|
||
}
|
||
|
||
// Variables the generated scanner expects to find in scope.
// NOTE(review): presumably ts/te are token start/end, act the last matched
// pattern and eof the end-of-input position -- confirm against the Ragel guide.
|
||
ts := 0
|
||
te := 0
|
||
act := 0
|
||
eof := pe
|
||
_ = ts // blank assignments avoid "declared and not used" errors if the generated code never reads these
|
||
_ = te
|
||
_ = act
|
||
|
||
// State maintained by the actions below.
|
||
// Byte offset of the first byte of the current token (set by startToken).
startPos := 0
|
||
// Byte offset of the last byte of the current token, inclusive (set by endToken).
endPos := 0
|
||
// Byte offset just past the last token appended to val.
totalConsumed := 0
|
||
%%{
|
||
|
||
# Pull in the SCRIPTS and WB machine definitions from the ragel/ directory.
# NOTE(review): presumably these supply the character classes used below
# (Hangul, Han, Hiragana, ALetter, Numeric, Extend, Format, ...) -- confirm in
# ragel/uscript.rl and ragel/uwb.rl.
include SCRIPTS "ragel/uscript.rl";
|
||
include WB "ragel/uwb.rl";
|
||
|
||
action startToken {
|
||
// Attached with ">" to every word pattern: records the scanner position
// at which the candidate token begins.
startPos = p
|
||
}
|
||
|
||
action endToken {
|
||
// Attached with "@" to every word pattern: records the scanner position
// of the latest byte that completes a match. The index is inclusive, so
// the finish* actions slice data[startPos:endPos+1].
endPos = p
|
||
}
|
||
|
||
action finishNumericToken {
|
||
// Scanner action for WordNumeric. Without atEOF nothing is emitted: the
// function returns what has been collected so far.
// NOTE(review): presumably because more input could still extend the token.
if !atEOF {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
|
||
// Emit the token as a sub-slice of data, tagged Number.
val = append(val, data[startPos:endPos+1])
|
||
types = append(types, Number)
|
||
totalConsumed = endPos+1
|
||
// Stop early once the caller's token limit is reached.
if maxTokens > 0 && len(val) >= maxTokens {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
}
|
||
|
||
action finishHangulToken {
|
||
// Scanner action for WordHangul. The token is held back (return without
// emitting) when it ends exactly at the end of data and more input may
// follow, or when the bytes right after it do not decode as a valid rune
// (DecodeRune yields RuneError with width 1).
if endPos+1 == pe && !atEOF {
|
||
return val, types, totalConsumed, nil
|
||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
|
||
// Emit the token as a sub-slice of data, tagged Letter.
val = append(val, data[startPos:endPos+1])
|
||
types = append(types, Letter)
|
||
totalConsumed = endPos+1
|
||
// Stop early once the caller's token limit is reached.
if maxTokens > 0 && len(val) >= maxTokens {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
}
|
||
|
||
action finishKatakanaToken {
|
||
// Scanner action for WordKatakana. The token is held back (return without
// emitting) when it ends exactly at the end of data and more input may
// follow, or when the bytes right after it do not decode as a valid rune
// (DecodeRune yields RuneError with width 1).
if endPos+1 == pe && !atEOF {
|
||
return val, types, totalConsumed, nil
|
||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
|
||
// Emit the token as a sub-slice of data, tagged Ideo.
// NOTE(review): a Kana type constant is declared but unused; confirm
// whether Kana was intended here before changing it, since callers may
// rely on Ideo.
val = append(val, data[startPos:endPos+1])
|
||
types = append(types, Ideo)
|
||
totalConsumed = endPos+1
|
||
// Stop early once the caller's token limit is reached.
if maxTokens > 0 && len(val) >= maxTokens {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
}
|
||
|
||
action finishWordToken {
|
||
// Scanner action for the general Word pattern. Without atEOF nothing is
// emitted: the function returns what has been collected so far.
// NOTE(review): presumably because more input could still extend the word.
if !atEOF {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
// Emit the token as a sub-slice of data, tagged Letter.
val = append(val, data[startPos:endPos+1])
|
||
types = append(types, Letter)
|
||
totalConsumed = endPos+1
|
||
// Stop early once the caller's token limit is reached.
if maxTokens > 0 && len(val) >= maxTokens {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
}
|
||
|
||
action finishHanToken {
|
||
// Scanner action for WordHan (a single Han character plus trailing
// Extend/Format). The token is held back (return without emitting) when
// it ends exactly at the end of data and more input may follow, or when
// the bytes right after it do not decode as a valid rune (DecodeRune
// yields RuneError with width 1).
if endPos+1 == pe && !atEOF {
|
||
return val, types, totalConsumed, nil
|
||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
|
||
// Emit the token as a sub-slice of data, tagged Ideo.
val = append(val, data[startPos:endPos+1])
|
||
types = append(types, Ideo)
|
||
totalConsumed = endPos+1
|
||
// Stop early once the caller's token limit is reached.
if maxTokens > 0 && len(val) >= maxTokens {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
}
|
||
|
||
action finishHiraganaToken {
|
||
// Scanner action for WordHiragana (a single Hiragana character plus
// trailing Extend/Format). The token is held back (return without
// emitting) when it ends exactly at the end of data and more input may
// follow, or when the bytes right after it do not decode as a valid rune
// (DecodeRune yields RuneError with width 1).
if endPos+1 == pe && !atEOF {
|
||
return val, types, totalConsumed, nil
|
||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
|
||
// Emit the token as a sub-slice of data, tagged Ideo.
// NOTE(review): a Kana type constant is declared but unused; confirm
// whether Kana was intended here before changing it.
val = append(val, data[startPos:endPos+1])
|
||
types = append(types, Ideo)
|
||
totalConsumed = endPos+1
|
||
// Stop early once the caller's token limit is reached.
if maxTokens > 0 && len(val) >= maxTokens {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
}
|
||
|
||
action finishNoneToken {
|
||
// Scanner action for every pattern typed None (regional indicators, line
// breaks, Extend/Format runs, Other).
//
// First widen the token to a whole-rune boundary: decode runes starting
// at startPos until the decode position passes endPos, then make endPos
// the last byte of the rune that contained it.
lastPos := startPos
|
||
for lastPos <= endPos {
|
||
_, size := utf8.DecodeRune(data[lastPos:])
|
||
lastPos += size
|
||
}
|
||
endPos = lastPos -1
|
||
// Move the scanner position to the adjusted token end as well.
p = endPos
|
||
|
||
// Hold the token back (return without emitting) when it ends exactly at
// the end of data and more input may follow, or when the bytes right
// after it do not decode as a valid rune (RuneError with width 1).
if endPos+1 == pe && !atEOF {
|
||
return val, types, totalConsumed, nil
|
||
} else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
// Otherwise emit the token as a sub-slice of data, tagged None.
|
||
val = append(val, data[startPos:endPos+1])
|
||
types = append(types, None)
|
||
totalConsumed = endPos+1
|
||
// Stop early once the caller's token limit is reached.
if maxTokens > 0 && len(val) >= maxTokens {
|
||
return val, types, totalConsumed, nil
|
||
}
|
||
}
|
||
|
||
# Extended character classes: each "...Ex" below is one character of the
# named class followed by any number of Extend or Format characters, so those
# trailing characters stay attached to the character they follow.
# NOTE(review): this looks like the UAX#29 WB4 rule -- confirm.
HangulEx = Hangul ( Extend | Format )*;
|
||
HebrewOrALetterEx = ( Hebrew_Letter | ALetter ) ( Extend | Format )*;
|
||
NumericEx = Numeric ( Extend | Format )*;
|
||
KatakanaEx = Katakana ( Extend | Format )*;
|
||
# Characters allowed between two letters.
MidLetterEx = ( MidLetter | MidNumLet | Single_Quote ) ( Extend | Format )*;
|
||
# Characters allowed between two digits.
MidNumericEx = ( MidNum | MidNumLet | Single_Quote ) ( Extend | Format )*;
|
||
ExtendNumLetEx = ExtendNumLet ( Extend | Format )*;
|
||
HanEx = Han ( Extend | Format )*;
|
||
HiraganaEx = Hiragana ( Extend | Format )*;
|
||
SingleQuoteEx = Single_Quote ( Extend | Format )*;
|
||
DoubleQuoteEx = Double_Quote ( Extend | Format )*;
|
||
HebrewLetterEx = Hebrew_Letter ( Extend | Format )*;
|
||
RegionalIndicatorEx = Regional_Indicator ( Extend | Format )*;
|
||
# Any line-break character.
NLCRLF = Newline | CR | LF;
|
||
# Any character that is not a line break, plus trailing Extend/Format.
OtherEx = ^(NLCRLF) ( Extend | Format )* ;
|
||
|
||
# UAX#29 WB8. Numeric × Numeric
|
||
# WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
|
||
# WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
|
||
# WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||
# WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
||
#
|
||
# WordNumeric: optional leading ExtendNumLet characters, a digit, then further
# digits each preceded by either ExtendNumLet characters (possibly none) or a
# single MidNumeric separator, then optional trailing ExtendNumLet characters.
# startToken runs on entry; endToken runs on every transition into a final
# state, so endPos follows the longest match.
WordNumeric = ( ( ExtendNumLetEx )* NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )* ( ExtendNumLetEx )* ) >startToken @endToken;
|
||
|
||
# The two patterns here match only a subset of what Word (below) matches.
# They exist so that pure-Hangul and pure-Katakana runs get their own finish
# action, and therefore their own type code, in the main scanner.
|
||
WordHangul = ( HangulEx )+ >startToken @endToken;
|
||
WordKatakana = ( KatakanaEx )+ >startToken @endToken;
|
||
|
||
# UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
|
||
# WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
|
||
# WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
|
||
# WB7a. Hebrew_Letter × Single_Quote
|
||
# WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
|
||
# WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
|
||
# WB9. (ALetter | Hebrew_Letter) × Numeric
|
||
# WB10. Numeric × (ALetter | Hebrew_Letter)
|
||
# WB13. Katakana × Katakana
|
||
# WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||
# WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
||
#
|
||
# Marty - deviated here to allow for the (ExtendNumLetEx x ExtendNumLetEx) part of 13a
|
||
#
|
||
# Word: the general word pattern. Structure of the expression:
#   - optional leading ExtendNumLet characters;
#   - one or more pieces, each a Katakana run, a Hebrew letter followed by a
#     quote form, a numeric run, a letter run with MidLetter separators, or a
#     bare ExtendNumLet character;
#   - any number of repetitions of: one or more ExtendNumLet characters
#     followed by one or more such pieces (without the bare ExtendNumLet
#     alternative);
#   - optional trailing ExtendNumLet characters.
# startToken runs on entry; endToken runs on every transition into a final
# state.
Word = ( ( ExtendNumLetEx )* ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
|
||
| ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
|
||
| NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
|
||
| HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
|
||
|ExtendNumLetEx
|
||
)+
|
||
)
|
||
(
|
||
( ExtendNumLetEx )+ ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
|
||
| ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
|
||
| NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
|
||
| HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
|
||
)+
|
||
)
|
||
)* ExtendNumLetEx*) >startToken @endToken;
|
||
|
||
# UAX#29 WB14. Any ÷ Any
|
||
# Han and Hiragana are matched one (extended) character at a time, so each
# character becomes its own token.
WordHan = HanEx >startToken @endToken;
|
||
WordHiragana = HiraganaEx >startToken @endToken;
|
||
|
||
# A run of Extend/Format characters not attached to a preceding character.
# Because of the star, this pattern can also match the empty string.
WordExt = ( ( Extend | Format )* ) >startToken @endToken; # maybe plus not star
|
||
|
||
# Line breaks: a CR LF pair is one token; a lone CR, LF or Newline is one
# token each.
WordCRLF = (CR LF) >startToken @endToken;
|
||
|
||
WordCR = CR >startToken @endToken;
|
||
|
||
WordLF = LF >startToken @endToken;
|
||
|
||
WordNL = Newline >startToken @endToken;
|
||
|
||
# One or more consecutive regional-indicator characters form a single token.
WordRegional = (RegionalIndicatorEx+) >startToken @endToken;
|
||
|
||
# Any other non-line-break character (with trailing Extend/Format).
Other = OtherEx >startToken @endToken;
|
||
|
||
# Main scanner: maps each word pattern to the action that emits its token.
# WordNumeric, WordHangul and WordKatakana are listed before Word, of which
# they are subsets.
# NOTE(review): per the Ragel guide a scanner takes the longest match and
# breaks ties in favour of the earlier pattern, which would make this ordering
# what selects the more specific type -- confirm.
main := |*
|
||
WordNumeric => finishNumericToken;
|
||
WordHangul => finishHangulToken;
|
||
WordKatakana => finishKatakanaToken;
|
||
Word => finishWordToken;
|
||
WordHan => finishHanToken;
|
||
WordHiragana => finishHiraganaToken;
|
||
# Everything from here down is emitted with type None.
WordRegional =>finishNoneToken;
|
||
WordCRLF => finishNoneToken;
|
||
WordCR => finishNoneToken;
|
||
WordLF => finishNoneToken;
|
||
WordNL => finishNoneToken;
|
||
WordExt => finishNoneToken;
|
||
Other => finishNoneToken;
|
||
*|;
|
||
|
||
# Ask Ragel to emit the scanner's initialisation code and its execution loop
# at this point inside segmentWords.
write init;
|
||
write exec;
|
||
}%%
|
||
|
||
// Reached only when no action returned early. A state below s_first_final
// means the scanner did not stop in a final state; report ParseError along
// with whatever tokens were collected.
if cs < s_first_final {
|
||
return val, types, totalConsumed, ParseError
|
||
}
|
||
|
||
// Normal completion: all collected tokens, their types and the number of
// bytes consumed.
return val, types, totalConsumed, nil
|
||
}
|