(from github.com/nactac)
I’m trying to crawl a WebDAV repository hosted on Apache so that it can be searched.
The crawler successfully fetches the first page, which is shown below.
I expected the crawler to add the child directories “BuildTools” and “src” to the child URLs, but it did not.
The following debug log message was shown:
DEBUG Storing child urls: []
<?xml version="1.0"?><?xml-stylesheet type="text/xsl" href="/svnindex.xsl"?>
<!DOCTYPE svn [
<!ELEMENT svn (index)>
<!ATTLIST svn version CDATA #REQUIRED
href CDATA #REQUIRED>
<!ELEMENT index (updir?, (file | dir)*)>
<!ATTLIST index name CDATA #IMPLIED
path CDATA #IMPLIED
rev CDATA #IMPLIED
base CDATA #IMPLIED>
<!ELEMENT updir EMPTY>
<!ATTLIST updir href CDATA #REQUIRED>
<!ELEMENT file EMPTY>
<!ATTLIST file name CDATA #REQUIRED
href CDATA #REQUIRED>
<!ELEMENT dir EMPTY>
<!ATTLIST dir name CDATA #REQUIRED
href CDATA #REQUIRED>
]>
<svn version="1.8.10 (r1615264)"
href="http://subversion.apache.org/">
<index rev="39165" path="/Applications/FpScan" base="SPL-Series">
<updir href="../"/>
<dir name="BuildTools" href="BuildTools/" />
<file name="build.bat" href="build.bat" />
<file name="readme.txt" href="readme.txt" />
<dir name="src" href="src/" />
</index>
</svn>
Is this because the page is not HTML?
Does the page need to be transformed with svnindex.xsl before it can be parsed?