package org.archive.modules.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.archive.modules.CrawlURI;
import org.archive.modules.fetcher.FetchHTTP;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;

/* loaded from: input_file:org/archive/modules/extractor/ExtractorUniversal.class */
public class ExtractorUniversal extends ContentExtractor {
    private static final long serialVersionUID = 3;
    protected static final Pattern IP_ADDRESS = Pattern.compile("((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)");
    public static final Pattern TLDs = Pattern.compile("(ac(/.*)?)|(ad(/.*)?)|(ae(/.*)?)|(af(/.*)?)|(ag(/.*)?)|(ai(/.*)?)|(al(/.*)?)|(am(/.*)?)|(an(/.*)?)|(ao(/.*)?)|(aero(/.*)?)|(aq(/.*)?)|(ar(/.*)?)|(as(/.*)?)|(at(/.*)?)|(au(/.*)?)|(aw(/.*)?)|(az(/.*)?)|(ba(/.*)?)|(bb(/.*)?)|(bd(/.*)?)|(be(/.*)?)|(bf(/.*)?)|(bg(/.*)?)|(bh(/.*)?)|(bi(/.*)?)|(biz(/.*)?)|(bj(/.*)?)|(bm(/.*)?)|(bn(/.*)?)|(bo(/.*)?)|(br(/.*)?)|(bs(/.*)?)|(bt(/.*)?)|(bv(/.*)?)|(bw(/.*)?)|(by(/.*)?)|(bz(/.*)?)|(ca(/.*)?)|(cc(/.*)?)|(cd(/.*)?)|(cf(/.*)?)|(cg(/.*)?)|(ch(/.*)?)|(ci(/.*)?)|(ck(/.*)?)|(cl(/.*)?)|(cm(/.*)?)|(cn(/.*)?)|(co(/.*)?)|(com(/.*)?)|(coop(/.*)?)|(cr(/.*)?)|(cs(/.*)?)|(cu(/.*)?)|(cv(/.*)?)|(cx(/.*)?)|(cy(/.*)?)|(cz(/.*)?)|(de(/.*)?)|(dj(/.*)?)|(dk(/.*)?)|(dm(/.*)?)|(do(/.*)?)|(dz(/.*)?)|(ec(/.*)?)|(edu(/.*)?)|(ee(/.*)?)|(eg(/.*)?)|(eh(/.*)?)|(er(/.*)?)|(es(/.*)?)|(et(/.*)?)|(fi(/.*)?)|(fj(/.*)?)|(fk(/.*)?)|(fm(/.*)?)|(fo(/.*)?)|(fr(/.*)?)|(ga(/.*)?)|(gd(/.*)?)|(ge(/.*)?)|(gf(/.*)?)|(gg(/.*)?)|(gh(/.*)?)|(gi(/.*)?)|(gl(/.*)?)|(gm(/.*)?)|(gn(/.*)?)|(gov(/.*)?)|(gp(/.*)?)|(gq(/.*)?)|(gr(/.*)?)|(gs(/.*)?)|(gt(/.*)?)|(gu(/.*)?)|(gw(/.*)?)|(gy(/.*)?)|(hk(/.*)?)|(hm(/.*)?)|(hn(/.*)?)|(hr(/.*)?)|(ht(/.*)?)|(hu(/.*)?)|(id(/.*)?)|(ie(/.*)?)|(il(/.*)?)|(im(/.*)?)|(in(/.*)?)|(info(/.*)?)|(int(/.*)?)|(io(/.*)?)|(iq(/.*)?)|(ir(/.*)?)|(is(/.*)?)|(it(/.*)?)|(je(/.*)?)|(jm(/.*)?)|(jo(/.*)?)|(jp(/.*)?)|(ke(/.*)?)|(kg(/.*)?)|(kh(/.*)?)|(ki(/.*)?)|(km(/.*)?)|(kn(/.*)?)|(kp(/.*)?)|(kr(/.*)?)|(kw(/.*)?)|(ky(/.*)?)|(kz(/.*)?)|(la(/.*)?)|(lb(/.*)?)|(lc(/.*)?)|(li(/.*)?)|(lk(/.*)?)|(lr(/.*)?)|(ls(/.*)?)|(lt(/.*)?)|(lu(/.*)?)|(lv(/.*)?)|(ly(/.*)?)|(ma(/.*)?)|(mc(/.*)?)|(md(/.*)?)|(mg(/.*)?)|(mh(/.*)?)|(mil(/.*)?)|(mk(/.*)?)|(ml(/.*)?)|(mm(/.*)?)|(mn(/.*)?)|(mo(/.*)?)|(mp(/.*)?)|(mq(/.*)?)|(mr(/.*)?)|(ms(/.*)?)|(mt(/.*)?)|(mu(/.*)?)|(museum(/.*)?)|(mv(/.*)?)|(mw(/.*)?)|(mx(/.*)?)|(my(/.*)?)|(mz(/.*)?)|(na(/.*)?)|(name(/.*)?)|(nc(/.*)?)|(ne(/.*)?)|(net(/.*)?)|(nf(/.*)?)|(ng(/.*)?)|(ni(/.*)?)|(nl(/.*)?)|(no(/.*)?)|
(np(/.*)?)|(nr(/.*)?)|(nt(/.*)?)|(nu(/.*)?)|(nz(/.*)?)|(om(/.*)?)|(org(/.*)?)|(pa(/.*)?)|(pe(/.*)?)|(pf(/.*)?)|(pg(/.*)?)|(ph(/.*)?)|(pk(/.*)?)|(pl(/.*)?)|(pm(/.*)?)|(pn(/.*)?)|(pr(/.*)?)|(pro(/.*)?)|(ps(/.*)?)|(pt(/.*)?)|(pw(/.*)?)|(py(/.*)?)|(qa(/.*)?)|(re(/.*)?)|(ro(/.*)?)|(ru(/.*)?)|(rw(/.*)?)|(sa(/.*)?)|(sb(/.*)?)|(sc(/.*)?)|(sd(/.*)?)|(se(/.*)?)|(sg(/.*)?)|(sh(/.*)?)|(si(/.*)?)|(sj(/.*)?)|(sk(/.*)?)|(sl(/.*)?)|(sm(/.*)?)|(sn(/.*)?)|(so(/.*)?)|(sr(/.*)?)|(sv(/.*)?)|(st(/.*)?)|(sy(/.*)?)|(sz(/.*)?)|(tc(/.*)?)|(td(/.*)?)|(tf(/.*)?)|(tg(/.*)?)|(th(/.*)?)|(tj(/.*)?)|(tk(/.*)?)|(tm(/.*)?)|(tn(/.*)?)|(to(/.*)?)|(tp(/.*)?)|(tr(/.*)?)|(tt(/.*)?)|(tv(/.*)?)|(tw(/.*)?)|(tz(/.*)?)|(ua(/.*)?)|(ug(/.*)?)|(uk(/.*)?)|(um(/.*)?)|(us(/.*)?)|(uy(/.*)?)|(uz(/.*)?)|(va(/.*)?)|(vc(/.*)?)|(ve(/.*)?)|(vg(/.*)?)|(vi(/.*)?)|(vn(/.*)?)|(vu(/.*)?)|(wf(/.*)?)|(ws(/.*)?)|(ye(/.*)?)|(yt(/.*)?)|(yu(/.*)?)|(za(/.*)?)|(zm(/.*)?)|(zw(/.*)?)");

    public long getMaxSizeToParse() {
        return ((Long) this.kp.get("maxSizeToParse")).longValue();
    }

    public void setMaxSizeToParse(long j) {
        this.kp.put("maxSizeToParse", Long.valueOf(j));
    }

    public ExtractorUniversal() {
        setMaxSizeToParse(1048576L);
    }

    @Override // org.archive.modules.extractor.ContentExtractor
    protected boolean shouldExtract(CrawlURI crawlURI) {
        return true;
    }

    /* JADX WARN: Multi-variable type inference failed */
    /* JADX WARN: Type inference failed for: r4v2, types: [org.archive.modules.extractor.LinkContext] */
    @Override // org.archive.modules.extractor.ContentExtractor
    protected boolean innerExtract(CrawlURI crawlURI) {
        long j;
        InputStream inputStream = null;
        try {
            try {
                inputStream = crawlURI.getRecorder().getContentReplayInputStream();
                StringBuffer stringBuffer = new StringBuffer();
                long j2 = 0;
                long maxSizeToParse = getMaxSizeToParse();
                if (maxSizeToParse <= 0) {
                    maxSizeToParse = Long.MAX_VALUE;
                }
                boolean z = false;
                for (int read = inputStream.read(); read != -1; read = inputStream.read()) {
                    long j3 = j2 + 1;
                    j2 = j;
                    if (j3 > maxSizeToParse) {
                        break;
                    }
                    if (stringBuffer.length() > 2083) {
                        stringBuffer = new StringBuffer();
                        z = false;
                    } else if (isURLableChar(read)) {
                        if (read == 46) {
                            z = true;
                        }
                        stringBuffer.append((char) read);
                    } else if (stringBuffer.length() > 3 && z) {
                        String stringBuffer2 = stringBuffer.toString();
                        j = j;
                        if (looksLikeAnURL(stringBuffer2)) {
                            if (stringBuffer2.toLowerCase().indexOf(FetchHTTP.HTTP_SCHEME) > 0) {
                                stringBuffer2 = stringBuffer2.substring(stringBuffer2.toLowerCase().indexOf(FetchHTTP.HTTP_SCHEME));
                            }
                            while (stringBuffer2.substring(stringBuffer2.length() - 1).equals(".")) {
                                stringBuffer2 = stringBuffer2.substring(0, stringBuffer2.length() - 1);
                            }
                            this.numberOfLinksExtracted.incrementAndGet();
                            UURI uuri = crawlURI.getUURI();
                            UURI uURIFactory = UURIFactory.getInstance(stringBuffer2);
                            ?? r4 = LinkContext.SPECULATIVE_MISC;
                            crawlURI.getOutLinks().add(new Link(uuri, uURIFactory, r4, Hop.SPECULATIVE));
                            j = r4;
                        }
                        stringBuffer = new StringBuffer();
                        z = false;
                    } else if (stringBuffer.length() > 0) {
                        stringBuffer = new StringBuffer();
                        z = false;
                    }
                }
                IOUtils.closeQuietly(inputStream);
                return true;
            } catch (IOException e) {
                crawlURI.getNonFatalFailures().add(e);
                IOUtils.closeQuietly(inputStream);
                return true;
            }
        } catch (Throwable th) {
            IOUtils.closeQuietly(inputStream);
            throw th;
        }
    }

    private boolean looksLikeAnURL(String str) {
        if ((str.indexOf("http://") == 0 || str.indexOf("https://") == 0) && IP_ADDRESS.matcher(str).matches()) {
            return true;
        }
        int indexOf = str.indexOf(".");
        if (indexOf == 0) {
            return false;
        }
        while (indexOf != -1 && indexOf < str.length()) {
            str = str.substring(indexOf + 1);
            if (isTLD(str.substring(0, str.length() <= 6 ? str.length() : 6))) {
                return true;
            }
            indexOf = str.indexOf(".");
        }
        return false;
    }

    private boolean isTLD(String str) {
        if (str.length() < 2) {
            return false;
        }
        return TLDs.matcher(str.toLowerCase()).matches();
    }

    private boolean isURLableChar(int i) {
        if (i >= 35 && i <= 38) {
            return true;
        }
        if ((i >= 43 && i <= 59) || i == 61) {
            return true;
        }
        if ((i < 63 || i > 90) && i != 95) {
            return (i >= 97 && i <= 122) || i == 126;
        }
        return true;
    }
}
