mastodon/config/initializers/twitter_regex.rb

module Twitter
  class Regex
    # Overrides and additions for the REGEXEN pattern table used for URL and
    # XMPP URI detection. REGEXEN itself, and every key that is read below but
    # not assigned here (:valid_domain, :valid_port_number,
    # :valid_url_preceding_chars, :valid_url_query_chars,
    # :valid_url_query_ending_chars, :validate_url_unreserved,
    # :validate_url_pct_encoded, :validate_url_sub_delims), must already be
    # defined when this file is loaded (presumably by the twitter-text
    # library -- confirm).
    #
    # Assignment order matters: each pattern interpolates the current value of
    # the keys it references at the moment it is evaluated.

    # A single URL path character: anything except whitespace, "<", ">",
    # "(", ")" and "?".
    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
    # A character that may end a URL path: anything except whitespace and the
    # punctuation listed in the class, or alternatively a balanced-parentheses
    # group.
    # NOTE(review): the interpolation of :valid_url_balanced_parens happens
    # here, before that key is reassigned on the next statement, so this
    # pattern embeds whatever value the key held previously rather than the
    # definition below. Confirm this is intended before reordering.
    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
    # A parenthesised run of path characters, which may itself contain one
    # nested parenthesised run surrounded by optional path characters.
    REGEXEN[:valid_url_balanced_parens] = /
      \(
        (?:
          #{REGEXEN[:valid_general_url_path_chars]}+
          |
          # allow one nested level of balanced parentheses
          (?:
            #{REGEXEN[:valid_general_url_path_chars]}*
            \(
              #{REGEXEN[:valid_general_url_path_chars]}+
            \)
            #{REGEXEN[:valid_general_url_path_chars]}*
          )
        )
      \)
    /iox
    # One unit of a URL path. Either: optional path characters, then any number
    # of balanced-parentheses groups each followed by optional path characters,
    # then a valid ending character; or one or more path characters followed by
    # "/". Literal whitespace inside the pattern is ignored because of the x
    # flag.
    REGEXEN[:valid_url_path] = /(?:
      (?:
        #{REGEXEN[:valid_general_url_path_chars]}*
        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
        #{REGEXEN[:valid_url_path_ending_chars]}
      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
    )/iox
    # Complete URL matcher; the capture groups are numbered in the inline
    # comments. The protocol is optional and, when present, must be one of
    # http, https, dat, dweb, ipfs, ipns, ssb or gopher followed by "//".
    REGEXEN[:valid_url] = %r{
      (                                                                                     #   $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
        (                                                                                   #   $3 URL
          ((?:https?|dat|dweb|ipfs|ipns|ssb|gopher):\/\/)?                                  #   $4 Protocol (optional)
          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
          (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
        )
      )
    }iox
    # One character of the part that precedes "@" in an XMPP URI (see its use
    # in :valid_xmpp_uri below): an unreserved character, a percent-encoded
    # sequence, or one of ! $ ( ) * + , ; =
    REGEXEN[:validate_nodeid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      [!$()*+,;=]
    )/iox
    # One character of the resource that follows "/" in an XMPP URI (see its
    # use in :valid_xmpp_uri below): an unreserved character, a percent-encoded
    # sequence, or a sub-delimiter.
    REGEXEN[:validate_resid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      #{REGEXEN[:validate_url_sub_delims]}
    )/iox
    # Matcher for "xmpp:" URIs, kept separate from :valid_url. The capture
    # groups are numbered in the inline comments; group 3 is the URI itself,
    # without the preceding character.
    REGEXEN[:valid_xmpp_uri] = %r{
      (                                                                                     #   $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
        (                                                                                   #   $3 URL
          ((?:xmpp):)                                                                       #   $4 Protocol
          (//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)?                     #   $5 Authority (optional)
          (#{REGEXEN[:validate_nodeid]}+@)?                                                 #   $6 Username in path (optional)
          (#{REGEXEN[:valid_domain]})                                                       #   $7 Domain in path
          (/#{REGEXEN[:validate_resid]}+)?                                                  #   $8 Resource in path (optional)
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $9 Query String
        )
      )
    }iox
  end

  module Extractor
    # Scans <tt>text</tt> for XMPP URIs and returns an array with one hash per
    # match: the matched URI under <tt>:url</tt> and its start/end positions
    # (as reported by <tt>char_begin</tt>/<tt>char_end</tt>) under
    # <tt>:indices</tt>. A <tt>nil</tt> <tt>text</tt>, or one that contains no
    # colon at all, yields an empty array without running the regex.
    #
    # When a block is supplied it is invoked once per URI, after the whole
    # text has been scanned. <tt>options</tt> is accepted but not consulted.
    def extract_xmpp_uris_with_indices(text, options = {}) # :yields: uri, start, end
      return [] unless text && text.index(":")

      matches = []
      text.to_s.scan(Twitter::Regex[:valid_xmpp_uri]) do
        match = Regexp.last_match
        # Capture group 3 is the URI itself, without the preceding character.
        span = [match.char_begin(3), match.char_end(3)]
        matches << { :url => match[3], :indices => span }
      end

      if block_given?
        matches.each do |entry|
          first, last = entry[:indices]
          yield entry[:url], first, last
        end
      end

      matches
    end
  end
end
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`module Twitter`
			`class Regex`
Fix URL linkifier grabbing full-width spaces and quotations (#9997) Fix #9993 Fix #5654 2019-02-09 19:13:11 +00:00			`REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou`
			`REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\\|@]\|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`REGEXEN[:valid_url_balanced_parens] = /`
			`\(`
			`(?:`
			`#{REGEXEN[:valid_general_url_path_chars]}+`
			`\|`
			`# allow one nested level of balanced parentheses`
			`(?:`
			`#{REGEXEN[:valid_general_url_path_chars]}*`
			`\(`
			`#{REGEXEN[:valid_general_url_path_chars]}+`
			`\)`
			`#{REGEXEN[:valid_general_url_path_chars]}*`
			`)`
			`)`
			`\)`
			`/iox`
			`REGEXEN[:valid_url_path] = /(?:`
			`(?:`
			`#{REGEXEN[:valid_general_url_path_chars]}*`
			`(?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*`
			`#{REGEXEN[:valid_url_path_ending_chars]}`
			`)\|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)`
			`)/iox`
			`REGEXEN[:valid_url] = %r{`
			`( # $1 total match`
Misc. typos (#8694) Found via `codespell -q 3 --skip="./app/javascript/mastodon/locales,./config/locales"` 2018-09-13 22:53:09 +00:00			`(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`( # $3 URL`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-11 01:15:25 +00:00			`((?:https?\|dat\|dweb\|ipfs\|ipns\|ssb\|gopher):\/\/)? # $4 Protocol (optional)`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`(#{REGEXEN[:valid_domain]}) # $5 Domain(s)`
			`(?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)`
			`(/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor`
			`(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String`
			`)`
			`)`
			`}iox`
Add support for linking XMPP URIs in toots (#12709) * Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text 2020-01-11 01:15:25 +00:00			`REGEXEN[:validate_nodeid] = /(?:`
			`#{REGEXEN[:validate_url_unreserved]}\|`
			`#{REGEXEN[:validate_url_pct_encoded]}\|`
			`[!$()*+,;=]`
			`)/iox`
			`REGEXEN[:validate_resid] = /(?:`
			`#{REGEXEN[:validate_url_unreserved]}\|`
			`#{REGEXEN[:validate_url_pct_encoded]}\|`
			`#{REGEXEN[:validate_url_sub_delims]}`
			`)/iox`
			`REGEXEN[:valid_xmpp_uri] = %r{`
			`( # $1 total match`
			`(#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character`
			`( # $3 URL`
			`((?:xmpp):) # $4 Protocol`
			`(//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)? # $5 Authority (optional)`
			`(#{REGEXEN[:validate_nodeid]}+@)? # $6 Username in path (optional)`
			`(#{REGEXEN[:valid_domain]}) # $7 Domain in path`
			`(/#{REGEXEN[:validate_resid]}+)? # $8 Resource in path (optional)`
			`(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $9 Query String`
			`)`
			`)`
			`}iox`
			`end`

			`module Extractor`
			`# Extracts a list of all XMPP URIs included in the Tweet <tt>text</tt> along`
			`# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no`
			`# XMPP URIs an empty array will be returned.`
			`#`
			`# If a block is given then it will be called for each XMPP URI.`
			`def extract_xmpp_uris_with_indices(text, options = {}) # :yields: uri, start, end`
			`return [] unless text && text.index(":")`
			`urls = []`

			`text.to_s.scan(Twitter::Regex[:valid_xmpp_uri]) do`
			`valid_uri_match_data = $~`

			`start_position = valid_uri_match_data.char_begin(3)`
			`end_position = valid_uri_match_data.char_end(3)`

			`urls << {`
			`:url => valid_uri_match_data[3],`
			`:indices => [start_position, end_position]`
			`}`
			`end`
			`urls.each{\|url\| yield url[:url], url[:indices].first, url[:indices].last} if block_given?`
			`urls`
			`end`
Enable to recognize most kinds of characters as URL paths (#4941) 2017-09-14 16:03:20 +00:00			`end`
			`end`