Jump to content

User:Harej/citation-watchlist.js

From Wikipedia, the free encyclopedia
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
/* Per-wiki configuration */

// Which wiki the script talks to; both endpoints below are derived from
// or paired with these two values.
const LANGUAGE = "en";
const FAMILY = "wikipedia";
const actionApiEndpoint = "https://" + LANGUAGE + "." + FAMILY + ".org/w/api.php";
const restApiEndpoint = "https://api.wikimedia.org/core/v1";

// On-wiki pages holding the Public Suffix List copy and the index of
// domain-list pages.
const publicSuffixList = 'Wikipedia:Citation_Watchlist/Public_Suffix_List';
const listOfLists = 'Wikipedia:Citation_Watchlist/Lists';

// Tooltip labels, one per severity level.
const msgWarning = 'Warning';
const msgCaution = 'Caution';
const msgInspect = 'Inspect';

// Indicator characters placed in front of flagged entries.
const warnEmoji = "\u2757";
const cautionEmoji = "\u270B";
const inspectEmoji = "\uD83D\uDD0E";

// Exact wikitext section headings that open each severity section on a
// domain-list page.
const warnSectionHeader = '==Warn==';
const cautionSectionHeader = '==Caution==';
const inspectSectionHeader = '==Inspect==';

// Minimum spacing (ms) between Action API requests, and the self-imposed
// hourly budget for REST API requests.
const delayMs = 50;
const maxRequestsPerHour = 400;


/*
Citation Watchlist Script – Highlights watchlist entries when questionable sources are added

author:  Hacks/Hackers
license: GPL 4.0
*/

// Shared lookup tables. runScript() replaces publicSuffixSet and fills the
// three domain lists; the other functions only read them.
let publicSuffixSet = new Set();
let warnList = new Set();
let cautionList = new Set();
let inspectList = new Set();
// Timestamp (ms since epoch) of the most recent Action API request, used by
// fetchFromActionAPI() to space requests out.
let lastRequestTime = 0;

// The Wikimedia REST API has a hard request limit of 500 per hour, and no clear
// way to batch these requests. As such, we need to track our requests, and to do
// so globally across the whole session (not just a single instantiation of the
// script.)
//
// The counter lives in localStorage next to the start time of the current
// one-hour window. A per-page timer alone is not enough: it is destroyed on
// every navigation (so the counter might never be reset), and a freshly
// opened page would reset the counter in the middle of a window.
(function initRequestCounter() {
  const countKey = 'citationWatchlistRestApiRequestCount';
  const windowStartKey = 'citationWatchlistRestApiWindowStart';
  const windowMs = 3600000;
  const pollMs = 60000;

  // Starts a new counting window when the stored one is missing, invalid,
  // lies in the future (clock change) or is at least an hour old.
  function resetIfWindowExpired() {
    const windowStart = parseInt(localStorage.getItem(windowStartKey), 10);
    const age = Date.now() - windowStart;
    const expired = Number.isNaN(windowStart) || age < 0 || age >= windowMs;
    if (expired || localStorage.getItem(countKey) === null) {
      localStorage.setItem(countKey, '0');
      localStorage.setItem(windowStartKey, Date.now().toString());
      console.log("Request count reset");
    }
  }

  resetIfWindowExpired();
  // Re-check regularly so a long-lived page notices when the window ends.
  setInterval(resetIfWindowExpired, pollMs);
})();

/**
 * Reads the REST API request counter from localStorage.
 *
 * @returns {number} The stored count, or 0 when the stored value is missing
 *   or does not parse as an integer.
 */
function getRequestCount() {
  const stored = localStorage.getItem('citationWatchlistRestApiRequestCount');
  const parsed = parseInt(stored, 10);
  if (Number.isNaN(parsed)) {
    return 0;
  }
  return parsed;
}

/**
 * Adds one to the REST API request counter kept in localStorage and logs
 * the new value.
 */
function incrementRequestCount() {
  const nextCount = getRequestCount() + 1;
  localStorage.setItem('citationWatchlistRestApiRequestCount', String(nextCount));
  console.log(`Request count incremented to ${nextCount}`);
}

/**
 * Inserts an emoji marker (with a tooltip listing the matched domains)
 * immediately before `element`, at most once per severity level.
 *
 * @param {Element} element - Node the marker is inserted before; it is
 *   tagged with a `data-processed-<level>` attribute afterwards.
 * @param {string} emoji - One of warnEmoji, cautionEmoji or inspectEmoji.
 * @param {string[]} domains - Domains shown in the tooltip.
 * @param {string} tooltipText - Label that precedes the domain list.
 */
function prependEmojiWithTooltip(element, emoji, domains, tooltipText) {
  // Ordered pairs so that the first matching emoji wins, as in an if/else chain.
  const levelsByEmoji = [
    [warnEmoji, 'warn'],
    [cautionEmoji, 'caution'],
    [inspectEmoji, 'inspect'],
  ];
  const matchedLevel = levelsByEmoji.find(([candidate]) => candidate === emoji);
  if (matchedLevel === undefined) {
    console.error('Unsupported emoji type');
    return;
  }

  // The marker attribute keeps repeated calls from stacking duplicate emoji.
  const markerAttribute = `data-processed-${matchedLevel[1]}`;
  const alreadyMarked = element.getAttribute(markerAttribute) === 'true';
  if (alreadyMarked) {
    return;
  }

  const badge = document.createElement('span');
  badge.textContent = `${emoji} `;
  badge.title = `${tooltipText}: ${domains.join(", ")}`;
  element.parentNode.insertBefore(badge, element);
  element.setAttribute(markerAttribute, 'true');
}

/**
 * Scans the current page (watchlist / recent changes / page history) for
 * change entries and works out which revisions to inspect for each one.
 *
 * @returns {Promise<Array<{oldrevision: (string|number), newrevision: (string|number|undefined), element: Node}>>}
 *   One record per entry. `oldrevision` is the revision handed to the REST
 *   API first; `newrevision`, when present, is the revision it is compared
 *   against. `element` is the node the indicator emoji is attached to.
 *   Returns an empty array when the page has no change-list containers.
 */
async function parseWatchlist() {
  // Select all containers of the watchlist links to process them individually
  const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
  const revisions = [];

  // Nothing to annotate on pages without change lists; bail out before any
  // API request is made.
  if (entriesContainers.length === 0) {
    return revisions;
  }

  // `anchor.href` is a fully resolved URL, so it has to go through URL()
  // first: URLSearchParams alone would fold "https://…?" into the name of
  // the first query parameter.
  const queryParamsOf = (link) => new URL(link.href).searchParams;

  // Build map of previous revision IDs
  const revisionIds = [];
  for (const container of entriesContainers) {
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    if (prevLink) {
      const oldid = queryParamsOf(prevLink).get('oldid');
      if (oldid !== null) {
        revisionIds.push(oldid);
      }
    }
  }
  console.log(revisionIds);
  // Only page histories have "prev" links; skip the request everywhere else.
  const previousRevisionMap = revisionIds.length > 0
    ? await fetchPreviousRevisionIds(revisionIds)
    : {};

  let linkCounter = 0;

  for (const container of entriesContainers) {
    const diffLink = container.querySelector('a.mw-changeslist-diff');
    const histLink = container.querySelector('a.mw-changeslist-history');
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    const curLink = container.querySelector('a.mw-history-histlinks-current');

    if (diffLink) {
      // First we are checking if we are in recent changes / watchlist.
      // If a "diff" link is found, process it
      linkCounter += 1;
      const urlParams = queryParamsOf(diffLink);
      revisions.push({
        oldrevision: urlParams.get('diff'),
        newrevision: urlParams.get('oldid'),
        element: diffLink.parentNode.parentNode
      });
    } else if (histLink) {
      // If no "diff" link is found but a "hist" link is, process the "hist" link
      linkCounter += 1;
      const pageID = queryParamsOf(histLink).get('curid');
      const firstID = await fetchFirstRevisionId(pageID);
      // fetchFirstRevisionId() yields null on failure; there is nothing to
      // inspect in that case.
      if (firstID !== null && firstID !== undefined) {
        revisions.push({
          oldrevision: firstID,
          element: histLink.parentNode.parentNode
        });
      }
    } else if (prevLink) {
      // At this point, check if we are on a page history rather than watchlist
      linkCounter += 1;
      const oldid = queryParamsOf(prevLink).get('oldid');
      revisions.push({
        oldrevision: oldid,
        newrevision: previousRevisionMap[oldid],
        element: prevLink.parentNode.parentNode
      });
    } else if (curLink) {
      // No prev link means we are at the page's first revision
      // We do not actually want to compare to the current revision. We extract
      // the oldid and treat like a new page.
      linkCounter += 1;
      revisions.push({
        oldrevision: queryParamsOf(curLink).get('oldid'),
        element: curLink.parentNode.parentNode
      });
    }
  }

  // Finally, to get to this point, you are on a page history with only
  // one revision, and therefore no links of any kind. Look the first (and
  // only) revision up through the page ID.
  if (linkCounter === 0) {
    const pageID = mw.config.get('wgArticleId');
    // A falsy page ID means there is no article behind this page to look up.
    if (pageID) {
      const firstID = await fetchFirstRevisionId(pageID);
      if (firstID !== null && firstID !== undefined) {
        revisions.push({
          oldrevision: firstID,
          element: entriesContainers[0]
        });
      }
    }
  }

  return revisions;
}

/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function delay(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Builds an Action API request URL from a plain parameter object.
 *
 * @param {Object} params - Query parameters; each own enumerable key is
 *   appended to the query string of actionApiEndpoint.
 * @returns {Promise<URL>} The assembled URL (wrapped in a promise because
 *   the function is declared async).
 */
async function buildURL(params) {
  const requestUrl = new URL(actionApiEndpoint);
  for (const name of Object.keys(params)) {
    requestUrl.searchParams.append(name, params[name]);
  }
  return requestUrl;
}

/**
 * Reduces a hostname to its registrable ("root") domain using Public Suffix
 * List rules: the longest matching public suffix plus one more label.
 *
 * Supports the three rule forms of the list: plain rules (`co.uk`),
 * wildcard rules (`*.ck`) and exception rules (`!www.ck`).
 *
 * @param {string} hostname - Hostname to reduce, e.g. `www.example.co.uk`.
 * @param {Set<string>} publicSuffixSet - Rules, one per entry, in the
 *   textual form used by the Public Suffix List.
 * @returns {string} The registrable domain, or `hostname` unchanged when no
 *   rule matches or when the hostname is itself a public suffix.
 */
function getRootDomain(hostname, publicSuffixSet) {
  const domainParts = hostname.split('.');
  for (let i = 0; i < domainParts.length; i++) {
    const candidate = domainParts.slice(i).join('.');

    // An exception rule says the candidate is NOT a public suffix even
    // though a wildcard covers it, so the candidate itself is registrable.
    if (publicSuffixSet.has(`!${candidate}`)) {
      return candidate;
    }

    // A wildcard rule "*.parent" turns every direct child of `parent`
    // into a public suffix.
    const parent = domainParts.slice(i + 1).join('.');
    const matchesWildcard = parent !== '' && publicSuffixSet.has(`*.${parent}`);

    if (publicSuffixSet.has(candidate) || matchesWildcard) {
      // With i === 0 the whole hostname is a public suffix and there is no
      // extra label to prepend (slice(-1) would wrongly keep only the last
      // label).
      return i === 0 ? hostname : domainParts.slice(i - 1).join('.');
    }
  }
  return hostname;
}

/**
 * Extracts every http(s) URL found in a piece of wikitext.
 *
 * @param {string} addedParts - Wikitext to scan. Any non-string value
 *   (e.g. a failed fetch result) yields an empty list.
 * @returns {string[]} Normalised URLs (`URL#href`) in order of appearance;
 *   duplicates are kept.
 */
function extractAddedURLs(addedParts) {
  const addedURLs = [];
  if (typeof addedParts !== 'string') {
    return addedURLs;
  }

  // Besides whitespace, `<` and `"`, stop at the wikitext delimiters that
  // commonly follow a URL directly: `|` and `}` (template parameters), `[`
  // and `]` (bracketed external links) and `>`. Without this,
  // "url=http://example.com|title=…" and "[http://example.com]" either fail
  // to parse or produce a hostname with trailing junk.
  const urlRegex = /https?:\/\/[^\s<>"\[\]{}|]+/g;
  let match;
  while ((match = urlRegex.exec(addedParts)) !== null) {
    // Sentence punctuation right after a bare URL is not part of the URL.
    const candidate = match[0].replace(/[.,;:!?]+$/, '');
    try {
      const url = new URL(candidate);
      addedURLs.push(url.href);
    } catch (error) {
      console.error(`Invalid URL rejected: ${match[0]}`);
    }
  }
  return addedURLs;
}

/**
 * Sends a GET request to the MediaWiki Action API, keeping at least
 * `delayMs` milliseconds between the start of consecutive requests.
 *
 * @param {Object} params - Query parameters for the request.
 * @returns {Promise<Object>} The parsed JSON response.
 * @throws {Error} When the request fails, the HTTP status is not OK, or the
 *   body is not valid JSON. The error is logged before being rethrown.
 */
async function fetchFromActionAPI(params) {
  const requestUrl = await buildURL(params);
  console.log(`Action API request: ${requestUrl}`);

  // Throttle: wait out whatever remains of the minimum gap since the
  // previous request.
  const sinceLastRequest = Date.now() - lastRequestTime;
  if (sinceLastRequest < delayMs) {
    await delay(delayMs - sinceLastRequest);
  }
  lastRequestTime = Date.now();

  try {
    const reply = await fetch(requestUrl);
    if (reply.ok) {
      return await reply.json();
    }
    throw new Error(`Network response was not ok: ${reply.statusText}`);
  } catch (failure) {
    console.error('Error fetching data from MediaWiki API:', failure);
    throw failure;
  }
}

/**
 * Downloads the on-wiki copy of the Public Suffix List as raw page text and
 * returns its rules.
 *
 * Blank lines and `//` comment lines are skipped; every other line is
 * stored trimmed.
 *
 * @returns {Promise<Set<string>>} The rules, or an empty set when the
 *   request fails or the server answers with an HTTP error (callers treat
 *   an empty set as "loading failed").
 */
async function fetchPublicSuffixList() {
  const pslUrl = `https://${LANGUAGE}.${FAMILY}.org/wiki/${publicSuffixList}?action=raw`;
  console.log(`Raw page text request: ${pslUrl}`);
  try {
    const response = await fetch(pslUrl);
    // Without this check the body of an error page (404, 5xx, …) would be
    // parsed as if it were suffix rules and yield a non-empty, bogus set.
    if (!response.ok) {
      throw new Error(`Unexpected HTTP status ${response.status} ${response.statusText}`);
    }
    const content = await response.text();
    const suffixSet = new Set();
    for (const line of content.split('\n')) {
      const rule = line.trim();
      if (rule && !rule.startsWith('//')) {
        suffixSet.add(rule);
      }
    }
    return suffixSet;
  } catch (error) {
    console.error("Error fetching Public Suffix List:", error);
    return new Set();
  }
}

/**
 * Fetches a revision source or a revision comparison from the REST API,
 * honouring the self-imposed hourly request budget.
 *
 * @param {string} apiUrl - Full REST API URL to request.
 * @returns {Promise<(string|Array|null|undefined)>} The `source` field of
 *   the response when present, otherwise its `diff` field (`undefined` when
 *   the response has neither). `null` on a network error, an HTTP error
 *   status or an unparsable body.
 */
async function fetchDiffFromAPI(apiUrl) {
  if (getRequestCount() >= maxRequestsPerHour) {
    console.warn("Request limit reached, waiting for reset...");
    // Poll until the shared counter has actually been reset instead of
    // sleeping a fixed hour and then proceeding unconditionally.
    while (getRequestCount() >= maxRequestsPerHour) {
      await delay(60000);
    }
  }

  incrementRequestCount();
  console.log(`Diff API request: ${apiUrl} (Request count: ${getRequestCount()})`);
  try {
    const response = await fetch(apiUrl);
    // Error responses (rate limiting, missing revisions, …) carry an error
    // document rather than a diff; report them instead of parsing them.
    if (!response.ok) {
      throw new Error(`REST API responded with HTTP ${response.status}`);
    }
    const data = await response.json();
    // Explicit undefined check so that an empty page source ("") is
    // returned as such.
    return data["source"] !== undefined ? data["source"] : data["diff"];
  } catch (error) {
    console.error('Error fetching API content:', error);
    return null;
  }
}

/**
 * For every revision record, fetches the diff (or the full source of a new
 * page), collects the URLs it adds, and prepends a warn / caution / inspect
 * emoji to the entry for each list that contains one of their domains.
 *
 * Each domain is reported under the most severe list that contains it, and
 * only there.
 *
 * @param {Array<{oldrevision: *, newrevision: *, element: Node}>} revisions
 *   Records as produced by parseWatchlist(). Records without a revision ID
 *   or without an element are skipped.
 * @returns {Promise<void>}
 */
async function fetchDiffAndProcess(revisions) {
  for (const revision of revisions) {
    // Without an ID there is nothing to request and without an element
    // there is nowhere to attach a marker; skip before spending one of the
    // rate-limited REST requests.
    if (revision.oldrevision === null || revision.oldrevision === undefined || !revision.element) {
      console.warn('Skipping entry without a revision ID or page element');
      continue;
    }

    let apiUrl = `${restApiEndpoint}/${FAMILY}/${LANGUAGE}/revision/${revision.oldrevision}`;
    if (revision.newrevision !== undefined) {
      apiUrl += `/compare/${revision.newrevision}`;
    }
    const diff = await fetchDiffFromAPI(apiUrl);
    let addedURLs = [];

    if (Array.isArray(diff)) { // actual diffs are arrays; new pages are strings
      // Types 2 and 4 represent "from".
      // Types 1 and 5 represent "to".
      // Type 3 represents changes within a line. It will be harder to extract URL changes in this case.
      const fromURLs = [];
      const toURLSet = new Set();

      for (const diffLine of diff) {
        const isFromLine = diffLine.type === 2 || diffLine.type === 4;
        const isToLine = diffLine.type === 1 || diffLine.type === 5;
        if (!isFromLine && !isToLine) {
          continue;
        }
        for (const lineURL of extractAddedURLs(diffLine.text)) {
          if (isFromLine) {
            fromURLs.push(lineURL);
          } else {
            toURLSet.add(lineURL);
          }
        }
      }

      // URLs on the "from" side with no counterpart on the "to" side are
      // the ones treated as added.
      addedURLs = fromURLs.filter(url => !toURLSet.has(url));
    } else {
      addedURLs = extractAddedURLs(diff);
    }

    console.log(`Old revision: ${revision.oldrevision}
    New revision: ${revision.newrevision}
    API URL: ${apiUrl}
    Revision element: ${revision.element.innerHTML}
    Added URLs: ${addedURLs.join(' ')}
    `);

    // Sets de-duplicate for free. The list membership alone decides the
    // bucket, so a repeated warn-listed domain can no longer fall through
    // into the caution or inspect bucket.
    const matchedWarnDomains = new Set();
    const matchedCautionDomains = new Set();
    const matchedInspectDomains = new Set();

    for (const url of addedURLs) {
      const hostname = new URL(url).hostname;
      const domain = getRootDomain(hostname, publicSuffixSet);

      if (warnList.has(domain)) {
        matchedWarnDomains.add(domain);
      } else if (cautionList.has(domain)) {
        matchedCautionDomains.add(domain);
      } else if (inspectList.has(domain)) {
        matchedInspectDomains.add(domain);
      }
    }

    if (matchedWarnDomains.size > 0) {
      prependEmojiWithTooltip(revision.element, warnEmoji, Array.from(matchedWarnDomains), msgWarning);
    }
    if (matchedCautionDomains.size > 0) {
      prependEmojiWithTooltip(revision.element, cautionEmoji, Array.from(matchedCautionDomains), msgCaution);
    }
    if (matchedInspectDomains.size > 0) {
      prependEmojiWithTooltip(revision.element, inspectEmoji, Array.from(matchedInspectDomains), msgInspect);
    }
  }
}

/**
 * Downloads the given domain-list pages and sorts their bullet entries into
 * warn / caution / inspect sets according to the section they appear in.
 *
 * A line equal (after trimming) to one of the configured section headers
 * switches the active list; every following line starting with `*` adds its
 * text to that list. Bullets before the first recognised header are ignored.
 * Pages that are missing or carry no main-slot content are skipped.
 *
 * @param {string[]} pageNames - Titles of the domain-list pages.
 * @returns {Promise<{warnList: Set<string>, cautionList: Set<string>, inspectList: Set<string>}>}
 *   Lower-cased domains per severity level.
 * @throws {Error} When the API request fails or its response lacks the
 *   expected `query.pages` structure (logged, then rethrown).
 */
async function fetchAndOrganizeDomainLists(pageNames) {
  // Local result sets; named differently from the module-level lists so
  // they do not shadow them.
  const warnDomains = new Set();
  const cautionDomains = new Set();
  const inspectDomains = new Set();
  const organized = {
    warnList: warnDomains,
    cautionList: cautionDomains,
    inspectList: inspectDomains
  };

  if (!Array.isArray(pageNames) || pageNames.length === 0) {
    return organized;
  }

  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageNames.join('|'), // Join all page names
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };

  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;

    for (const pageId in pages) {
      const page = pages[pageId];
      const revision = page.revisions && page.revisions[0];
      const mainSlot = revision && revision.slots && revision.slots.main;
      // Missing or deleted pages come back without revisions; skipping
      // them keeps one bad title from discarding every other list.
      if (!mainSlot || typeof mainSlot['*'] !== 'string') {
        console.warn(`Skipping domain list page without content: ${page.title || pageId}`);
        continue;
      }

      let currentList = null;
      for (const line of mainSlot['*'].split('\n')) {
        const trimmed = line.trim();
        if (trimmed === warnSectionHeader) {
          currentList = warnDomains;
        } else if (trimmed === cautionSectionHeader) {
          currentList = cautionDomains;
        } else if (trimmed === inspectSectionHeader) {
          currentList = inspectDomains;
        }

        if (currentList && trimmed.startsWith('*')) {
          // Hostnames taken from parsed URLs are lower case, so entries
          // are normalised to match them.
          const domain = trimmed.substring(1).trim().toLowerCase();
          if (domain) {
            currentList.add(domain);
          }
        }
      }
    }

    return organized;
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}

/**
 * Looks up the parent revision ID for each of the given revision IDs.
 *
 * The Action API accepts a limited number of values per multi-value
 * parameter (50 for ordinary clients), so the IDs are requested in batches.
 *
 * @param {Array<string|number>} revisionIds - Revision IDs to resolve.
 *   Null, undefined, empty and duplicate entries are ignored.
 * @returns {Promise<Object>} Map from revision ID to its `parentid`.
 *   Batches that fail are logged and left out; the result is `{}` when
 *   nothing could be resolved.
 */
async function fetchPreviousRevisionIds(revisionIds) {
  const batchSize = 50;
  const revisionMap = {};

  const usableIds = (revisionIds || []).filter(
    (id) => id !== null && id !== undefined && id !== ''
  );
  const uniqueIds = Array.from(new Set(usableIds));

  for (let start = 0; start < uniqueIds.length; start += batchSize) {
    const params = {
      action: 'query',
      prop: 'revisions',
      revids: uniqueIds.slice(start, start + batchSize).join('|'), // join this batch of revision IDs
      rvprop: 'ids',
      format: 'json',
      origin: '*'
    };

    try {
      const data = await fetchFromActionAPI(params);
      const pages = data.query.pages;
      for (const pageId in pages) {
        const revisions = pages[pageId].revisions;
        if (revisions && revisions.length > 0) {
          for (const revision of revisions) {
            revisionMap[revision.revid] = revision.parentid;
          }
        }
      }
    } catch (error) {
      // Keep whatever earlier batches returned; only this batch is lost.
      console.error('Error fetching previous revision IDs:', error);
    }
  }

  return revisionMap;
}

/**
 * Asks the Action API for the oldest revision of a page.
 *
 * @param {(string|number)} pageID - Page ID passed as `pageids`.
 * @returns {Promise<(number|null)>} The `revid` of the first revision, or
 *   `null` when the request fails or the page has no revisions (the error
 *   is logged).
 */
async function fetchFirstRevisionId(pageID) {
  // rvdir "newer" with rvlimit 1 makes the API list revisions oldest-first
  // and stop after one.
  const query = {
    action: 'query',
    pageids: pageID,
    prop: 'revisions',
    rvlimit: 1,
    rvdir: 'newer',
    format: 'json',
    origin: '*'
  };

  try {
    const reply = await fetchFromActionAPI(query);
    const pageMap = reply.query.pages;
    const [firstKey] = Object.keys(pageMap);
    const revisionList = pageMap[firstKey].revisions;

    const hasRevision = Boolean(revisionList) && revisionList.length > 0;
    if (!hasRevision) {
      throw new Error('No revisions found for this page.');
    }
    return revisionList[0].revid;
  } catch (error) {
    console.error('Error fetching first revision ID:', error);
    return null;
  }
}

/**
 * Returns the titles of the domain-list pages linked from the given index
 * page, using a four-hour localStorage cache.
 *
 * Only bullet lines whose first item is a wikilink are considered
 * (`* [[Title]]`, with or without the space and with an optional
 * `|label`); the link target, trimmed, is the title.
 *
 * @param {string} pageName - Title of the index ("list of lists") page.
 * @returns {Promise<string[]>} Page titles in order of appearance.
 * @throws {Error} When the API request fails or the page has no content
 *   (logged, then rethrown).
 */
async function fetchDomainListPages(pageName) {
  const cacheKey = `citationWatchlistFetchDomainListPages_${pageName}`;
  const cacheExpiration = 4 * 60 * 60 * 1000; // 4 hours in milliseconds
  const now = Date.now();
  const cachedData = localStorage.getItem(cacheKey);
  const cachedTimestamp = localStorage.getItem(`${cacheKey}_timestamp`);

  if (cachedData && cachedTimestamp && (now - parseInt(cachedTimestamp, 10)) < cacheExpiration) {
    // A corrupt cache entry must not break the script; fall through to a
    // fresh download instead.
    try {
      const cachedTitles = JSON.parse(cachedData);
      if (Array.isArray(cachedTitles)) {
        console.log("Loaded list of lists from cache");
        return cachedTitles;
      }
    } catch (error) {
      console.warn('Ignoring unreadable list-of-lists cache entry:', error);
    }
  }

  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageName,
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const page = data.query.pages;
    const pageId = Object.keys(page)[0];
    const content = page[pageId].revisions[0].slots.main['*'];
    const pageTitles = [];
    for (const line of content.split('\n')) {
      // Capture the link target only: stopping at "|" keeps a display
      // label out of the title, where it would later be read as a title
      // separator by the API.
      const match = line.match(/^\*\s*\[\[([^\]|]+)/);
      if (match) {
        const title = match[1].trim();
        if (title) {
          pageTitles.push(title);
        }
      }
    }
    localStorage.setItem(cacheKey, JSON.stringify(pageTitles));
    localStorage.setItem(`${cacheKey}_timestamp`, now.toString());
    console.log("Loaded from API and stored in cache");
    return pageTitles;
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}

/**
 * Entry point: loads the Public Suffix List and the domain lists, then
 * scans the current page and annotates entries that add listed domains.
 *
 * Stops early (after logging) when the Public Suffix List cannot be loaded
 * or when no domain list entry could be loaded, since nothing could be
 * flagged in either case and REST API requests are rate limited.
 *
 * @returns {Promise<void>}
 */
async function runScript() {
  publicSuffixSet = await fetchPublicSuffixList();
  if (publicSuffixSet.size === 0) {
    console.error('Public Suffix List loading failed');
    return;
  }
  console.log("Welcome to Citation Watchlist");

  try {
    // Both steps can throw; keeping them in the same try block turns a
    // failing index page into a logged error rather than an unhandled
    // rejection.
    const listPages = await fetchDomainListPages(listOfLists);
    const lists = await fetchAndOrganizeDomainLists(listPages);
    lists.warnList.forEach((domain) => warnList.add(domain));
    lists.cautionList.forEach((domain) => cautionList.add(domain));
    lists.inspectList.forEach((domain) => inspectList.add(domain));
  } catch (error) {
    console.error('Error fetching domain lists:', error);
  }

  if (warnList.size + cautionList.size + inspectList.size === 0) {
    console.error('No domain list entries loaded; skipping page scan');
    return;
  }

  const watchlistRevisions = await parseWatchlist();
  await fetchDiffAndProcess(watchlistRevisions);
}

// Kick the script off. The catch handler keeps an unexpected failure from
// surfacing as an unhandled promise rejection.
runScript()
  .then(() => console.log('Citation Watchlist script finished executing'))
  .catch((error) => console.error('Citation Watchlist script failed:', error));