// Alternation of the five HTML entities that escapeHTML produces. It is embedded in the
// patterns below so that escaped characters can be recognized inside already-escaped text.
const escapeCharacterMatcher = '&amp;|&lt;|&gt;|&quot;|&#039;';

// Attempts to match a starting boundary character for a URL, for instances where we don't
// include HTTP or HTTPS that are easy to match as a starting point. Includes most special
// characters that might usually precede a URL, as well as the start of the string and the
// escaped HTML entities. It excludes things like letters and numbers so we don't pick URLs
// out of long strings of characters where they would be unexpected.
const urlStartMatcher = `(^|${escapeCharacterMatcher}|[-\\s()[\\]{}<>"'\`,.?!:;=+_\\^~#*|])`;

// Only matches HTTP or HTTPS, or no protocol which will be interpreted as HTTPS.
const urlProtocolMatcher = '(?:http:\\/\\/|https:\\/\\/)?';

// Matches any domain, but requires it to start with a letter or number. URL domains technically
// don't need to start this way, but most common ones do, and it will provide a nicer UX if we can
// restrict that match. An optional "www." prefix is allowed, and the leading character must be
// followed by 1 to 255 further domain characters.
const urlDomainMatcher = '(?:www\\.)?[a-zA-Z0-9][-a-zA-Z0-9@:%._+~#=]{1,255}';

// Includes a limited number of domain extensions to match so that we aren't matching any string
// that looks like "thing.thing" when there is no protocol, since many might not really be a URL.
// The negative lookahead rejects an extension that is immediately followed by a dot and another
// domain character, i.e. one that is not the final extension of the domain.
const urlLimitedExtensionMatcher =
  '\\.(?:com|net|org|gov|edu|int|co|us|uk|io|ly)(?!\\.[-a-zA-Z0-9@:%._+~#=])';

// Matches any URL extension, we only use this when we have found a known protocol (i.e. HTTP(S)).
// The extension must be 2 to 63 lowercase letters and is guarded by the same negative lookahead
// as the limited extension matcher.
const urlAnyExtensionMatcher = '\\.[a-z]{2,63}(?!\\.[-a-zA-Z0-9@:%._+~#=])';

// Matches a URL path including query parameters. Uses a positive lookahead to only
// match a dot (.) or question mark (?) as a part of the URL if it is followed by another
// path character, since it is common for people to end a URL with a period or question
// mark if they are typing out a sentence.
const urlPathBodyMatcher = '[-a-zA-Z0-9()@%_=+~#&/]|[.?](?=[a-zA-Z0-9()@%_=+~#&/])';
// Both path matchers require a word boundary before the (possibly empty) path. The escaped
// variant additionally accepts the HTML entities, since it is run against escaped text.
const urlEscapedPathMatcher = `\\b(?:${escapeCharacterMatcher}|${urlPathBodyMatcher})*`;
const urlUnescapedPathMatcher = `\\b(?:${urlPathBodyMatcher})*`;

// Responsible for finding the initial URL match taking escaped characters into consideration.
// Group 1 captures the leading boundary and group 2 captures the candidate URL. The global
// flag lets a single replace() call visit every candidate in the text.
const urlEscapedMatcher = `${urlProtocolMatcher}${urlDomainMatcher}${urlAnyExtensionMatcher}${urlEscapedPathMatcher}`;
const urlEscapedRegex = new RegExp(`${urlStartMatcher}(${urlEscapedMatcher})`, 'g');

// Responsible for matching the final URL within the unescaped version of the initial match.
// There is a version for URLs with known protocols and one for those without known protocols.
// In both, group 1 captures the URL and group 2 captures everything that follows it. Neither
// pattern is anchored, so the match can begin after the start of the input.
const urlNoProtocolMatcher = `${urlDomainMatcher}${urlLimitedExtensionMatcher}${urlUnescapedPathMatcher}`;
const urlNoProtocolRegex = new RegExp(`(${urlNoProtocolMatcher})(.*)`);

const urlWithProtocolMatcher = `${urlProtocolMatcher}${urlDomainMatcher}${urlAnyExtensionMatcher}${urlUnescapedPathMatcher}`;
const urlWithProtocolRegex = new RegExp(`(${urlWithProtocolMatcher})(.*)`);

/**
 * Replaces the five characters that OWASP lists as required for safe HTML escaping
 * (&, <, >, ", ') with their entity equivalents.
 * See https://cheatsheetseries.owasp.org/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.html
 *
 * @param {string} unsafe - Raw text that may contain HTML-significant characters.
 * @returns {string} The text with each significant character replaced by its entity.
 */
const escapeHTML = (unsafe) => {
  const entityByCharacter = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#039;',
  };

  // Each source character is visited exactly once, so ampersands that belong to
  // freshly written entities are never escaped a second time.
  return unsafe.replace(/[&<>"']/g, (character) => entityByCharacter[character]);
};

/**
 * Reverses escapeHTML by converting the five supported entities
 * (&amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, &amp;#039;) back into their characters.
 *
 * All entities are decoded in a single pass. Decoding them one after another would
 * let an ampersand produced by "&amp;amp;" join the text that follows it and be decoded
 * a second time (for example "&amp;amp;quot;" would wrongly become a double quote), which
 * would make this function fail to round-trip the output of escapeHTML.
 *
 * @param {string} safe - Text previously escaped with escapeHTML.
 * @returns {string} The text with every supported entity decoded exactly once.
 */
const unescapeHTML = (safe) => {
  const characterByEntity = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#039;': "'",
  };

  return safe.replace(/&(?:amp|lt|gt|quot|#039);/g, (entity) => characterByEntity[entity]);
};

// Reports whether the URL begins with one of the two protocols we are willing to link to.
const hasAllowedProtocol = (url) =>
  ['http://', 'https://'].some((protocol) => url.startsWith(protocol));

/**
 * HTML-escapes a string and wraps every URL found in it in an anchor tag.
 *
 * Has approximately the same functionality as: https://www.npmjs.com/package/linkifyjs
 * except faster and smaller in size. Can probably be interchanged if it makes sense in the future.
 *
 * @param {string} string - Raw, unescaped text.
 * @returns {string} Escaped HTML with URLs turned into links. Falsy input is returned unchanged.
 */
const safelyLinkifyURLsInString = (string) => {
  if (!string) {
    return string;
  }

  const htmlEscapedText = escapeHTML(string);
  return htmlEscapedText.replace(urlEscapedRegex, (match, leadingChars, escapedURL) => {
    // The URL we matched is just a "possible" URL. We need to unescape the match, so
    // we can confirm that by matching the URL by its real characters.
    const unescapedURL = unescapeHTML(escapedURL);

    // If there are parentheses, only match the URL up to the first unbalanced closing
    // parenthesis so people can enclose a URL in parentheses and have it work correctly.
    // A null result means everything is balanced and the full URL is used.
    const parenInfo = findUnbalancedParenthesis(unescapedURL);
    const workingURL = parenInfo?.balancedParenURL || unescapedURL;

    // If we have an allowed protocol, use a different regex that will allow us to
    // more freely match domain extensions.
    const urlUnescapedRegex = hasAllowedProtocol(unescapedURL)
      ? urlWithProtocolRegex
      : urlNoProtocolRegex;

    // Do another regex match within the unescaped version of the initial match.
    const urlMatches = workingURL.match(urlUnescapedRegex);
    if (!urlMatches) {
      // If we don't find a URL within this match, just return the original.
      return match;
    }

    // Unpack groups into the URL we want, and any trailing characters.
    // Position zero is the full match, which we don't need.
    const [, url, trailingChars] = urlMatches;

    // The confirming regex is not anchored, so it may begin partway through the
    // candidate (e.g. "abc.xyz/test.com" only confirms "test.com"). Keep whatever
    // it skipped as plain text so none of the original content is dropped.
    const skippedChars = workingURL.slice(0, urlMatches.index);

    // If a URL doesn't specify an allowed protocol, assume HTTPS.
    const protocolSafeURL = hasAllowedProtocol(url) ? url : `https://${url}`;

    // Re-escape everything that is written back into the markup. The href needs it as
    // much as the visible text: a raw "&" inside an attribute value can be interpreted
    // as the start of a character reference and change the link target.
    const escapedHref = escapeHTML(protocolSafeURL);
    const reescapedURL = escapeHTML(url);
    const escapedSkippedChars = escapeHTML(skippedChars);

    // Re-escape the trailing characters so they render safely.
    const allTrailingChars = (trailingChars || '') + (parenInfo?.unbalancedTrailingChars || '');
    const escapedTrailingChars = escapeHTML(allTrailingChars);

    // Capturing the leading characters separately and inserting here prevents URLs from
    // being matched in the middle of long strings of characters.
    return `${leadingChars}${escapedSkippedChars}<a href="${escapedHref}" target="_blank" rel="noopener noreferrer">${reescapedURL}</a>${escapedTrailingChars}`;
  });
};

/**
 * Splits a URL at its first unbalanced closing parenthesis.
 *
 * @param {string} url - The candidate URL to inspect.
 * @returns {{ balancedParenURL: string, unbalancedTrailingChars: string } | null}
 *   The part before the unbalanced parenthesis and the part starting at it, or null
 *   when no unbalanced closing parenthesis exists.
 */
const findUnbalancedParenthesis = (url) => {
  const splitAt = findIndexOfTrailingParenthesis(url);

  // Nothing to split when every closing parenthesis has a matching opener.
  if (splitAt === -1) {
    return null;
  }

  return {
    balancedParenURL: url.substring(0, splitAt),
    unbalancedTrailingChars: url.substring(splitAt),
  };
};

/**
 * Finds the position of the first closing parenthesis that has no matching opening
 * parenthesis before it.
 *
 * @param {string} string - The text to scan.
 * @returns {number} The index of that parenthesis, or -1 when there is none.
 */
const findIndexOfTrailingParenthesis = (string) => {
  // Number of opening parentheses still waiting for their closing partner.
  let openCount = 0;
  let position = 0;

  while (position < string.length) {
    const character = string[position];

    if (character === ')') {
      // A closer with nothing left to close is the one we are looking for.
      if (openCount === 0) {
        return position;
      }
      openCount -= 1;
    } else if (character === '(') {
      openCount += 1;
    }

    position += 1;
  }

  return -1;
};

export { escapeHTML, unescapeHTML, safelyLinkifyURLsInString };
