v3.4.5-branch #1
|
@ -214,39 +214,10 @@ class Formatter
|
|||
result.flatten.join
|
||||
end
|
||||
|
||||
UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/
|
||||
|
||||
def utf8_friendly_extractor(text, options = {})
|
||||
old_to_new_index = [0]
|
||||
|
||||
escaped = text.chars.map do |c|
|
||||
output = begin
|
||||
if c.ord.to_s(16).length > 2 && !UNICODE_ESCAPE_BLACKLIST_RE.match?(c)
|
||||
CGI.escape(c)
|
||||
else
|
||||
c
|
||||
end
|
||||
end
|
||||
|
||||
old_to_new_index << old_to_new_index.last + output.length
|
||||
|
||||
output
|
||||
end.join
|
||||
|
||||
# Note: I couldn't obtain list_slug with @user/list-name format
|
||||
# for mention so this requires additional check
|
||||
special = Extractor.extract_urls_with_indices(escaped, options).map do |extract|
|
||||
new_indices = [
|
||||
old_to_new_index.find_index(extract[:indices].first),
|
||||
old_to_new_index.find_index(extract[:indices].last),
|
||||
]
|
||||
|
||||
next extract.merge(
|
||||
indices: new_indices,
|
||||
url: text[new_indices.first..new_indices.last - 1]
|
||||
)
|
||||
end
|
||||
|
||||
special = Extractor.extract_urls_with_indices(text, options)
|
||||
standard = Extractor.extract_entities_with_indices(text, options)
|
||||
extra = Extractor.extract_extra_uris_with_indices(text, options)
|
||||
|
||||
|
|
|
@ -24,6 +24,10 @@ module Twitter::TwitterText
|
|||
)
|
||||
\)
|
||||
/iox
|
||||
REGEXEN[:valid_iri_ucschar] = /[\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/iou
|
||||
REGEXEN[:valid_iri_iprivate] = /[\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/iou
|
||||
REGEXEN[:valid_url_query_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/iou
|
||||
REGEXEN[:valid_url_query_ending_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9_&=#\/\-]/iou
|
||||
REGEXEN[:valid_url_path] = /(?:
|
||||
(?:
|
||||
#{REGEXEN[:valid_general_url_path_chars]}*
|
||||
|
|
Loading…
Reference in a new issue