" Test if ' in O'Hara works Matches may turn out to be buried in tags e.g. "row" in a
'Richard Bapty', // textbox 'dept' => '', 'flag' => 1, // 'uog' => '1', 'debug' => '0', 'anchor' => 'anchor', ); $formenu1 = "\n
\n"; $formenu2 = "
\n"; $debugmenu = array( // Example of spec for a mkmenu() menu // 'value' => 'text of line on menu' '2' => 'Verbose', '1' => 'medium', '0' => "Only emergencies." ); // can also just have list of values (no texts), and call // mkformmenu('varname', $menuarray, 1); $deptmenu = array('', 'computing', 'chemistry', 'dcs', 'psychology'); $uogmenu = array('1', '0'); $flagmenu = array('1', '0'); // x. Get the FORM variables set $url0 = 'http://www.psy.gla.ac.uk'; $formurl = $url0 . $PHP_SELF; //PHP_SELF => /~steve/tmp/decode.php // Look for simple request value with no var=val $reqstr = preg_replace('/&.*/', '', urldecode($QUERY_STRING)); if (preg_match('/=/', $reqstr)) { $reqstr = ''; }; $hasarg = 0; /// In case want to display differently if no user args yet. if (isset($findstr)) { $hasarg = 1; } else { $findstr = $vars['findstr']; if ($reqstr) { $findstr = $reqstr; $hasarg = 1; }; }; // set all defaults if not set by GET (user) foreach (array_keys($vars) as $varname) { if (!isset(${$varname})) ${$varname} = $vars[$varname]; }; /* obsolete manual settings // TEST //$debug = 1; //$start = 'http://www.gla.ac.uk/services/learningteaching/contactus/staffcontacts/'; //$dept = 'computing'; $name = 'Deneka donald'; $name = 'craig xxxxxxxx'; $name = 'quintin Cutts'; $name = 'Arthur Whittaker'; $name = 'james Currall'; $name = 'Richard Bapty'; $name = 'Rob Hill'; $name = 'asdfasdfadfasdf'; $name = 'zzzz'; //$dept = 'chemistry'; $flag = 1; $uog = 1; // END of Test settings */ // REQUEST ? ARGS EXPECTED ==================================== // $who or $name: Name to be found e.g. 'steve Draper' // $end: Short cut page to go to if available // $start: Search page to start with // $anchor: say what anchor tag in use and tell php that there is one // $flag: if present (any value) then output index page of everything //tried/found, instead of the best page itself. // $dept=chemistry If present, first search univ list of depts, match, //go to dept. 
page, search for "staff" hoping for staff list, // then search that for original names. // $uog: if present (any value) then print official person page. // ==================================== My PARAMETERS $defaultanchor = 'anchor'; $defaultfindstr = 'Draper'; $startlist = 'http://www.gla.ac.uk/services/learningteaching/contactus/staffcontacts/'; $uogurl = 'gla.ac.uk'; // NOT http: ... //$uogdb = 'http://www.gla.ac.uk:443/stafflist/surname.cfm?surname='; $uogdb = 'http://www.gla.ac.uk/stafflist/?action=search&surname='; $uogdb = 'http://www.gla.ac.uk/stafflist/index.html?action=search&surname='; // to be treated as a db, urls must end in = $uogdepts = 'http://www.gla.ac.uk/departments/'; $dcs = 'http://www.dcs.gla.ac.uk/contacts/search.cfm?position=everybody'; $defaultstart = $uogdb; $logfile = 'phpnames.html'; // or get this script name and s/.php/.html/ $logfileurl = $logfile; $endindex = "\n\n"; $startindex = "\n Index page for name search for \"TGT\" \n"; $linepat = '/]*>'; // Exclude '
' /* OBSOLETE INIT // ==================================== CHECK REQUEST VARS: process them. // NAME if ($name) $findstr = $name; elseif ($who) $findstr = $who; elseif ($QUERY_STRING) { if (! preg_match('/\&/', $QUERY_STRING)) { $findstr = $QUERY_STRING; $REQUEST_URI = preg_replace('/\?/', '?name=', $REQUEST_URI); // fix up REQUEST for later re-use in redirect re-call. //$_GET['name'] = $findstr; } }; //else $findstr = $defaultfindstr; //$startindex = preg_replace('/TGT/', $findstr, $startindex); */ $startindex = str_replace('TGT', $findstr, $startindex); if (! $findstr) { echo 'Need a find argument.

'; exit; } // actually should put up FORM box and ask for this arg. // END url if ($end && file_get_contents($end)) // short cut // Would save traffic to test header, but want to follow redirs { header('Location: ' . $end); exit; } // ANCHOR TAG if (! $anchor) { // This is cunning method of getting a #anchor into the browser, // given that PHP won't show any anchor in addr // AND that user may not have put one in anyway. $url = $REQUEST_URI; if (! preg_match('/\?/', $url)) { echo 'No request part (e.g. with name): ....php?name=Draper

'; exit; } $a = $defaultanchor; $frag = "&anchor=$a#$a"; $url = 'http://' . $SERVER_NAME . $url . $frag; echo "url: $url

"; header('Location: ' . $url); exit; }; // DEPT arg if ($dept) { // most work done at top of big loop $dept = trim($dept); $dp = 0; } // META // if $meta then google search for several staff contact pages // START URL if ($start) $url = $start; else $url = $defaultstart; // $flag, $uog are only 2 vars names same as params // Idea is: if in URL param, take that var, 1 as default; // If set manually, it's only default. // If neither, default effectively zero. // FLAG if (isset($_REQUEST{'flag'})) { $val = $_REQUEST{'flag'}; if (strlen($val) == 0) $val =1 ; $flag = $val; } // UOG if (isset($_REQUEST{'uog'})) { $val = $_REQUEST{'uog'}; if (strlen($val) == 0) $val =1 ; $uog = $val; } if ($uog) { $url = $uogdb; }; // SHOW EVERYTHING if (0) { print "_GET:
"; foreach ($_GET as $key => $val) { echo "$key => $val
"; }; print "_REQUEST:
"; foreach ($_REQUEST as $key => $val) { echo "$key => $val
"; }; print "_GLOBALS:
"; foreach ($GLOBALS as $key => $val) { echo "$key => $val
"; }; phpinfo(); exit; } // ==================================== FUNCTIONS function myprintr($a) { echo '

'; print_r($a); echo '
'; } function prline($line, $key) { $line = preg_replace(";
;is", '', $line); $line = preg_replace(";;is", '', $line); //
forces breaks return "
$key: $line
\n"; } function mklink($url) { $out = 'TGT'; $out = str_replace('TGT', $url, $out); return $out; } function google($domainurl) { global $findstr, $dept; $u = 'http://www.google.com/custom?'; $u = 'http://www.google.co.uk/custom?'; $words = $findstr; if ($dept) $words .= $dept; //$words .= ' home page'; $words = trim($words); $words = str_replace(' ', '+', $words); $u .= 'q=' . $words; //$u .= '&cof='; //#$uu = $uogurl; if ($depturl) $uu = $depturl; //$u .= '&domains=' . $domainurl; $u .= '&sitesearch=' . $domainurl; // needs a domain url like gla.ac.uk or chem.gla.ac.uk NOT http://www return $u; } function mkformmenu($varname, $valuearray, $islist, $suppressevent) { $out = ''; if ($islist) { // convert a simple list into val+text descrip pairs $arr = $valuearray; $valuearray = array(); foreach ($arr as $val) { $valuearray[$val] = trim($val); }; // this also incidentally drops duplicates }; global ${$varname}; $currentval = ${$varname}; $out .= ''; // only netscape needs the onkeypress to make CR submit $menu = "\n

Menu

" . $formenu1 . $textbox . "\n
" . chr(160) . 'Dept: ' . mkformmenu('dept', $deptmenu, 1, 1) . chr(160) . 'UoG: ' . mkformmenu('uog', $uogmenu, 1, 1) . chr(160) . 'Index vs. full page: ' . mkformmenu('flag', $flagmenu, 1, 1) . chr(160) . 'Debug: ' . mkformmenu('debug', $debugmenu, 0, 1) . chr(160) . "\n" . $formenu2 . $menu; } function myexit() { // used for error exits and showing index page. global $flag, $index, $startindex, $endindex, $menu; echo $startindex; mkmenu(); echo $menu; echo $index; echo $endindex; exit; } function fetchpage($url) { global $index, $linktext, $bufhash; //$index .= "trace fetch: ||$url||
"; //$url = preg_replace('|amp;|is', '', $url); // Now dealt with below in rel/abs URL conversion if (array_key_exists($url, $bufhash)) return $bufhash[$url]; //if ($bufhash[$url]) return $bufhash[$url]; $buf = file_get_contents($url); //$buf = file_get_contents(urldecode($url)); // Test HTTP response headers returned foreach ($http_response_header as $line) { if (preg_match('/^HTTP/is', $line)) { $aa[0] = $line; } elseif (preg_match('/^([a-z-]+):(.*)$/is', $line, $matches)) { $aa[$matches[1]] = $matches[2]; } else { $index .= "Bad line in http response header:: $line
"; }; }; if ($aa['Location']) { $url = $aa['Location']; }; $s = preg_replace("/^[^ ]*\s*/is", '', $aa[0]); $n = preg_replace("/ .*/is", '', $s); if ($n > 200) { $buf = ''; $index .= "HTTP Response: $s
"; }; //echo "http N: $n
"; if (strlen($buf) < 2000 && preg_match('/error/is', $buf)) { $index .= "Apparently an error page.
"; //$buf = ''; }; if (! $buf) { $u = mklink($url); $nn = strlen($buf); $index .= "Couldn't get page ($nn): $linktext $u
\n"; $bufhash[$url] = $buf; return ''; }; // Delete comments. Java? $buf = preg_replace('//s', '', $buf); // convert its URLs to absolute // ===== function ($url, $buf) --> $buf // Extract stems from page's own URL (assumed Abs) // ----Get true base for URL abs/rel conversions. // 1) has already converted URL from Headers (if redirected) // 2) should look in meta info too; // // 3) should look for $BASE in page here and use it if present // .... // if find this pattern then $url = $base if (preg_match(';.*?<]+).*?;si', $buf, $matches)) { $base = $matches[1]; $url = $base; $index .= "\n"; echo "\n"; }; // untested preg_match(';^.*\//[^/]+;', $url, $matches); $urlserver = $matches[0]; // also need protocol only; and rel. starts //abc... $urldir = preg_replace(';[^/]*$;', '', $url); $urlnoquery = preg_replace(';\?.*$;', '', $url); $urlnotag = preg_replace(';#.*$;', '', $url); $urlproto = preg_replace(';:.*$;', '', $url); // dir means server plus dir root // Should convert all new urls by DIY decode() // needed because of pages with & inside link URLs // browsers can deal with this, but fetch(), header() don't. // change this to use builtin htmlspecialchars_decode() // ALSO convert \ to / in URLs //$buf = preg_replace('/((HREF|SRC)\s*=\s*(\'|"|).*?&)amp;(.*?$3)/si', $buf = preg_replace('/((HREF|SRC)\s*=\s*(\'|"|)[^"\']+?\&)amp;/si', '$1', $buf); $count = 1; while ($count) { $buf = preg_replace('/((HREF|SRC)\s*=\s*(\'|"|)[^"\']*?)\\\/si', '$1' . '/', $buf, -1, $count); }; // This isn't good: need better pattern for all SRCs // Turn various types of Rel to Abs. if (0) { $pat = ';((?:HREF|SRC)\s*=\s*(\'|"|))(.*?)(\'|"|>|<);si'; preg_match_all($pat, $buf, $matches); echo "\n   URL:$url
\n"; foreach ($matches[0] as $a) { echo "$a
\n"; }; } // If new url begins with // then ... $buf = preg_replace(';((HREF|SRC)\s*=\s*(\'|"|))//;i', '$1' . "$urlproto:", $buf); // untested // If new url begins with / then ... $buf = preg_replace(';((HREF|SRC)\s*=\s*(\'|"|))/;i', '$1' . "$urlserver/", $buf); $buf = preg_replace(';((HREF|SRC)\s*=\s*(\'|"|))\?;i', '$1' . "$urlnoquery?", $buf); $buf = preg_replace(';((HREF|SRC)\s*=\s*(\'|"|))#;i', '$1' . "$urlnotag#", $buf); // Add here the other rel.URL conversion: anything except starting http: $pat = ';((?:HREF|SRC)\s*=\s*(?>\'|"|))(?![a-z]+:)(.*?)>;si'; $buf = preg_replace($pat, '$1' . $urldir . '$2', $buf); if (0) { $pat = ';((?:HREF|SRC)\s*=\s*(\'|"|))(.*?)(\'|"|>|<);si'; preg_match_all($pat, $buf, $matches); echo "\n   URL:$url
\n"; foreach ($matches[0] as $a) { echo "$a
\n"; }; } // ==== end function $bufhash[$url] = $buf; return $buf; } function mkpatterns($name) { if (! $name) { $out[] = ''; return $out; } $patterns[] = "/\b$name\b/"; $patterns[] = "/\b$name\b/i"; $patterns[] = "/\b$name/"; $patterns[] = "/\b$name/i"; $patterns[] = "/$name/"; $patterns[] = "/$name/i"; $len = strlen($name); for ($i = $len; $i >= ($len/2); $i--) { $s = substr($name, 0, $i); $patterns[] = "/\b$s/i"; }; return $patterns; } function pat2RE($pat) { // Strip pattern of slashes, leave in \b global $case; $case = ''; if (preg_match('/i$/', $pat)) $case = 'i'; $pat = preg_replace('|^/|', '', $pat); $pat = preg_replace('|/.*$|', '', $pat); return $pat; } function mkshortpatterns($pats) { $last = $out = ''; foreach ($pats as $p) { $p = pat2RE($p); //$p = preg_replace('|\Q\b|', '', $p); $p = str_replace('\b', '', $p); if (strcasecmp($last, $p) != 0) $last = $out[] = $p; }; return $out; } // ==================================== INIT PROCESSING // NAME INITS // extract surname // should loop over name strings here, fetching // create whole string $s = str_replace('%20', ' ', $findstr); // de-code //$s = preg_replace('|^ *|', '', $s); $s = preg_replace('/ *$/', '', $s); $s = trim($s); // chop sps front and rear $s2 = preg_replace('/.* /', '', $s); // delete up to space: save last name; if (! preg_match('/^[a-z]/i', $s2)) $s2 = ''; $surname = $surname0 = $s2; $fname = $fname0 = preg_replace('| .*|', '', $s); // delete from space: save first name; will lose middle names if (strpos($s, ' ' ) === false ) $fname = ''; $fpatterns0 = mkpatterns($fname); $spatterns0 = mkpatterns($surname); foreach ($spatterns as $key => $val) { echo "$key => $val
"; }; foreach ($fpatterns as $key => $val) { echo "$key => $val
"; }; //foreach (mkshortpatterns($spatterns) as $key => $val) { echo "$key => $val
"; }; // Other inits $loopnmb = 0; $linktext = 'Starting URL'; $index = $buf = ''; $menu = ''; $a = mklink($REQUEST_URI); $menu .= "\n
  • Original search call: $a"; $menu .= "\n
  • Starting search terms: $fname $surname"; $a = mklink($logfileurl); $menu .= "\n
  • Page with 'index' trace list of relevant pages: $a"; //$url3 = 'http://' . $SERVER_NAME . $SCRIPT_NAME . '?'; $url3 = $uogdb . $surname; //$a = $defaultanchor; $url3 .= "&uog=1&anchor=$a#$a"; $menu3 = "\n
  • " . 'Person\'s standard university web page: search results'; $menu3b = "\n
  • " . 'Person\'s own standard university web page'; $menu3 = str_replace('TGT', $url3, $menu3); // but this searches their name in db and gets list of pages, not their one page // LOOP 0 Inits ==================================== // ON ENTRY to each iteration: // $url set with new URL to process // $linktext ready: the text of link pointing to current page to be done // $buf empty; old page contents may be available // Loop bool is $url. // Should continue only while there is a new URL to develop. // Half way through loop, $url set to NUL. // Loop work is about finding another; or not. // Set $url (and $linefortag) if and when found. // These tested for and/or used later on. while ($url) { //while (1) $loopnmb++; if ($dept) { if ($debug) $index .= "loopnmb: $loopnmb
    \n"; if ($loopnmb == 1) { //$url = $uogdb . 'M'; $url = $uogdepts; $surname = $dept; $fname = ''; } elseif ($loopnmb == 2) { $surname = 'staff'; $fname = 'list'; $dept = ''; // next time round, have search names reinstalled. //} elseif ($loopnmb == 4) { // $surname = $dept; $fname = ''; } else $dept = $loopnmb = ''; // insurance $spatterns = mkpatterns($surname); $fpatterns = mkpatterns($fname); } elseif ($uog && $loopnmb == 2) { //echo "X: $url
    "; $menu3 = str_replace('TGT', $url, $menu3b); //$url = google($uogurl); //header('Location: ' . $url); exit; // shortcut. Need s/amp/ ? } else { // reset names to search $surname = $surname0; $fname = $fname0; $spatterns = $spatterns0; $fpatterns = $fpatterns0; } $u = mklink($url); $index .= "\n

    ===========================================
    \n"; $index .= "New loop begins: $linktext $u
    \n"; $index .= "===========================
    \n"; $buf = ''; if ($recursionguard[$url]) { $u = mklink($url); $index .= "$linktext $u Second call to url: fatal loop
    \n"; myexit(); } else $recursionguard[$url] = 1; // LOOP phase 1 FETCH CONTACTS PAGE ==================================== // LOOP phase 1A Non-db single page ============= if (! preg_match('/=$/', $url)) { $buf = fetchpage($url); } else { // LOOP phase 1B Db page ============= if (! $surname) { $index .= "Empty surname at phase 1B
    \n"; myexit(); // What if no surname? concat alphas // loop over a-z, fetch 26 pages, concat } $urlorig = $url; foreach (mkshortpatterns($spatterns) as $pat) { $url = $urlorig . $pat; $buf = fetchpage($url); if ($buf && !preg_match('/no matches were found for your search/', $buf)) break; // better if bad searches returned HTTP err.code from UoG db $u = mklink($url); $index .= "$linktext $u No match returned
    \n"; }; }; // end of fetch page. //$index .= $buf; // desparate trace: show page being searched. // LOOP phase 1-2 MID-LOOP HINGE AND RE-SET ============================ if (! $buf) { $index .= "Failed to fetch page (internal error?).
    \n"; myexit(); } $a = mklink($url); $menu4 = "\n

  • Direct URL to page below $a";
    // nearly finished with url. Except for labelling current page.
    $alls = $linktexts = $urls = $matches = array();

    // LOOP phase 2 SEARCH FOR NAME within page ==========================
    // ========2A Collect page properties ==========================
    // Get the page title (or first heading) and the first heading;
    // put whichever mentions the person in the index.
    // preg_match() leaves $matches empty when nothing matches, so guard
    // the reads instead of indexing a missing element.
    preg_match(';<(title|H\d)[^>]*>([^<]+);i', $buf, $matches);
    $title1 = isset($matches[2]) ? $matches[2] : '';
    preg_match(';<(H\d)[^>]*>([^<]+);i', $buf, $matches);
    $title2 = isset($matches[2]) ? $matches[2] : '';
    $hit = 0;
    $ititle = ''; // reset per page so a stale title is never reported
    // Only non-empty names take part: an empty name would give the
    // regex "//i", which matches every title. Names are quoted because
    // they are literal text here, not patterns.
    $names = array();
    foreach (array($surname, $fname) as $nm) {
        if ((string) $nm !== '') $names[] = preg_quote($nm, '/');
    }
    if ($names) {
        $namere = '/' . implode('|', $names) . '/i';
        if (preg_match($namere, $title1)) {
            $hit = 1; $ititle = $title1;
        } elseif (preg_match($namere, $title2)) {
            $hit = 1; $ititle = $title2;
        }
    }
    $u = mklink($url);
    if ($debug) $index .= "New page is:<BR>$linktext $u $ititle <BR>\n";

    // A "web:" label followed by a link is taken as the dept's own site.
    $depturl = '';
    if (preg_match('/web:.*?<A[^>]*HREF="([^"]+)/is', $buf, $matches)) {
        $depturl = $matches[1];
        if ($dept && $loopnmb == 2) {
            $url = $depturl;
            $linktext = 'Dept. web: pointer';
            continue;
        }
    }

    // Insert <BR> at top and bot of <BODY> for later line-end search widening
    $buf = preg_replace('|<body[^>]*>|i', '$0' . ' <BR>', $buf, 1); // insert after
    $buf = preg_replace('|</body[^>]*>|i', '<BR> ' . '$0', $buf, 1); // insert before /BODY

    // now finished with url. Have labelled current page.
    $url = $linktext = $linefortag = ''; // preserve $buf !

    // ========2B Search page for surname ==========================
    // Loop over patterns + the empty pattern
    // Match all of: surrounding "line"; link; pattern (possibly empty)
    // If matches: Copy all to index. Pick first for future.
    // If none, continue loop.
    // If not hit by end, error exit
    // want to match whole line, so firstname search can be within that (e.g. dept)
    // Resulting Match order wanted:
    //   Order of sname pats
    //   w/w/o case
    //   w/w/o URL (if fetchable, if not mailto)
    //   order of fname pats.

    //select best surname pattern with a hit in the page
    $spat = '';
    while (! $spat) {
        if (!$surname) {
            // Fall back to searching on the first name alone.
            $surname = $fname;
            $spatterns = $fpatterns;
            $fpatterns = $fname = '';
            // Empty surname just a manoeuvre to get all db matches:
            // not needed beyond this point.
        }
        if (!$surname) break;
        foreach ($spatterns as $pat) {
            // search for $pat hits with URL
            $pat2 = pat2RE($pat); // also sets global $case
            if (preg_match("/$pat2/s$case", $buf)) { $spat = $pat2; break;}
        }
        if (! $spat) {
            $index .= "No $surname name match at all.<BR>\n";
            $surname = '';
        }
    }
    if ($spat) $index .= "Surname pattern will be: $pat<BR>\n";
    else {
        $index .= "No first or last name matched.<BR>\n";
        $url = '';
        break; // show contents of barren page.
        myexit(); // never reached: the break above leaves the main loop
    }

    // ======== 2B1 Search for surname WITH url ==========================
    $upat1 = ';<(?i:A)[^>]*(?i:HREF)\s*=\s*(?:\'|"|)\s*((?i:http:)[^\'">]*';
    // upto pat within URL
    $upat2 = ')(?:\'|"|)[^>]*>(.*?';
    // upto pat within linktext
    $upat3 = ")(?=</?(?i:A));s$case"; // end of link.
    // so search will restart at this second <A or </A

    // ======== 2B1a Find hit inside url ==========================
    // This RE matches exactly what's wanted (apart from testing fetchability)
    $n = 0;
    if ($spat) $n = preg_match_all($upat1 . $spat . '[^">]*' . $upat2 . $upat3,
        $buf, $matches);
    if ($debug) $index .= "<BR>$n hits found inside URL<BR>\n";
    if ($n) {
        for ($i = 0; $i < $n; $i++) {
            $a = $matches[0][$i];
            $b = $matches[1][$i]; // urls
            $c = $matches[2][$i]; // linktexts
            $bb = mklink($b);
            if ($debug) $index .= "Hit inside URL: $bb $c <BR>\n";
            // Keep only links whose target can actually be fetched.
            if (fetchpage($b)) {
                $alls[] = $a; $urls[] = $b; $linktexts[] = $c;
            }
        }
    }

    // ======== 2B1b Find hit inside linktext ==========================
    // Append these to any found inside URL
    // This RE finds all links in the doc. (no overlap);
    // then post-test retains only those with a name match.
    $n = 0;
    if ($spat) {
        $n = preg_match_all($upat1 . $upat2 . "($spat|</a|</A).*?" . $upat3,
            $buf, $matches);
            //$buf, $matches, PREG_OFFSET_CAPTURE);
        if ($debug) $index .= "<BR>$n links found.<BR>\n";
    }
    if ($n) {
        for ($i = 0; $i < $n; $i++) {
            $a = $matches[0][$i];
            //$a = $matches[0][$i][0];
            //$pos = $matches[0][$i][1]; // offset
            $b = $matches[1][$i]; // urls
            $c = $matches[2][$i]; // linktexts
            $d = $matches[3][$i]; // spat OR </A
            if ((strpos($d, '<' ) === false ) && fetchpage($b)) {
                // no good if </A match, not $spat
                $alls[] = $a; $urls[] = $b;
                $bb = mklink($b);
                $linktexts[] = $c;
                // Report the match number: the offset capture above is
                // disabled, so $pos has no value for this match.
                if ($debug) $index .= "$i: Hit inside linktext: $bb $c <BR>\n";
            }
        }
    }

    // ======== 2B1c Re-search and extend to line ends ==========================
    // Loop over alls[]; extend "alls" to include line ends.
    // This RE finds first linebrak, then first (distant) re-found link match
    // (and trailing linebrak).
    // Post-process then deletes all but last, nearest leading linebrak.
    // Loop extend matches to lines.
    // Need to restart 'line' not including terminator
    $count = count($alls);
    if ($debug) $index .= "<BR>$count matches found within links.<BR>\n";
    $pos = 0;
    for ($i = 0; $i < $count; $i++) {
        $pat = preg_quote($alls[$i], ';');
        //echo "Pos: $pos<BR>\n";
        if (preg_match(";$linepat(.*?$pat.*?)(?=$linepat);s$case",
                //$buf, $matches)) {
                $buf, $matches, PREG_OFFSET_CAPTURE, $pos)) {
            // This gets first linepat in buf, then match;
            // code next deletes all linepats before match
            $a = $matches[0][0];
            $n = $matches[0][1]; // pos of start of whole match
            $pos = $n + strlen($a); // next search resumes after this match
            $a = preg_replace(";.*$linepat;s", '', $a);
            $alls[$i] = $a;
        }
    }
    // Exactly one linked hit: follow it straight away.
    if ($count == 1) { $url = $urls[0]; $linktext = $linktexts[0]; continue; }

    // ======== 2B2 Search for surname outwith a link ==========================
    // This RE finds first linebrak, then first (distant) name match
    // (and trailing linebrak).
    // Post-process then deletes all but last, nearest leading linebrak.
if ($count == 0 && $spat) { if ($debug) $index .= "Looking for match outside links<BR>\n"; if (preg_match_all(";$linepat(.*?$spat.*?)(?=$linepat);s$case", $buf, $matches)) { // mustn't have $linepat at end of my target?? $a = $matches[0]; $a = preg_replace(";.*($linepat);s", '$1', $a); $alls = $a; // Replace by $1 and so preserve line start for anchor // Is this doing array replacments? YES. } } $count = count($alls); if ($debug) $index .= "<BR>$count extended line matches found.<BR>\n"; if ($count == 0) { $index .= "Error?: No match at all in Surname loop.<BR>\n"; // but may have no 1st or 2nd name matches on page at all. // but again: $pat loop should detect and report this } $index .= "<BR><BR>Lines with surname match; w/w/o link; w/w/o fname:<BR>\n"; foreach ($alls as $k => $line) { $index .= prline($line, $k); } // BREAK FROM LOOP IF 0 OR 1 SURNAME MATCHES if ($count < 2) { // from big loop: go print current buf $linefortag = $alls[0]; break; } // $alls has the line in: use for TAG // if only 1 but with URL then continue (loop jumpout) done above // ========2C Search for fname ========================== // Only search if is a fname; and more than 1 match from surname $found = 0; if ($fname) { //$index .= "<BR><BR>Fname matches:<BR>\n"; foreach ($fpatterns as $pat) { if ($found) break; for ($i = 0; $i < $count; $i++) { if ($found) break; $line = $alls[$i]; //if match that's it: anywhere on the line is as good? if (preg_match($pat, $line)) { $found = 1; $index .= 'Fname match found: '; $index .= prline($line ); break; } } } } if (! $found) $i = 0; // pick first one on page if ($count > 0) { $url = $urls[$i]; $linefortag = $alls[$i]; $linktext = $linktexts[$i]; }; // LOOP phase 3 ==== Finished searching in page. 
========================== // So: if url, loop (buf will be overwritten) // if not, line will be used for TAG on 3B processing // if no hits, return this page but no TAG // test urls as the come in, but not into buf // LOOP phase 3A ==== Set up loop to new URL ========================== // set $url to new value; set linktext; and set tag marker // Already done in loop. }; // end the big while loop //if (! $url) break; // loop exit // ============================================================== // LOOP phase 3B ===== No new page: prepare to print current one. ===== // post-loop processing. // 3B0: exit if not worth sending page if ($flag) { $index .= "Flag requires exit not page print.<BR>\n"; myexit(); } if (! $buf) { $index .= "Empty buf: exit. <BR>\n"; myexit(); } // 3B1: find hits, insert anchor tag there. $a = '<A NAME="' . $anchor . '"></A>'; if ($linefortag) { $buf = str_replace($linefortag, $a . $linefortag, $buf); } // 3B2: Create index webpage file. $bufi = $startindex; $bufi .= $index; $bufi .= $endindex; file_put_contents($logfile, $bufi); // file must (be created and) have write permission for server/everyone // 3B3: Insert menu of links at top of page after <BODY> mkmenu(); $buf = preg_replace('|<body[^>]*>|i', '$0' . $menu, $buf, 1); // 3B4: Disable java? No. //$buf = preg_replace('/<script/', '<qscript', $buf); // 3B5: Insert a java autoscroll / find command. No. // 3B6: HIGHLIGHT terms // Needs a) browser settings to allow changes to text style/colour // b) browser HTML4 to do css and span //$span = '<font color="red" ><B>'; $a = array(); if ($surname0) $a[$surname0] = '#FFFF00'; if ($fname0) $a[$fname0] = '#00FFFF'; //$a = array($surname0 => '#FFFF00', $fname0 => '#00FFFF'); $sp0 = '<span style="background-color:TGT" >'; $sp2 = '</span>'; foreach ($a as $key => $val) { $sp1 = str_replace('TGT', $val, $sp0); $buf = preg_replace("/$key(?=[^<>]*<)(?![^<>]*<\/TITLE)/is", $sp1 . '$0' . 
$sp2, $buf); // should really suppress it within <HEAD> $sp1 = str_replace('00', '66', $sp1); $sp1 = str_replace('FF', 'AA', $sp1); $replace = '$1' . $sp1 . '$2' . $sp2; $buf = preg_replace("/($key" . '[^<>]*>)([^<]*)/is', $replace, $buf); }; // SEND IT =========== echo $buf; /* ============================================================== ====== WHAT'S THE POINT? AIMS / REQUIREMENTS 1) Solve a little problem that used to annoy me: retrieving someone's details page from UoG staff db. 2) Semi-general code for related problems. When working, it should deal with a) UoG staff db; b) LTC staff page Then perhaps try other staff lists; and DCS db. ====== THE PROBLEMS KNOWN PROBLEMS Univ. makes you do 2 not 1 operation to get at a person's page. Univ page is often not the best page Depts: hard to find their staff lists Dbs conceal such pages [check and document specific examples] REAL FUNCTION(S) 1) To dig out the best home page for a person. How good is google? Esp. if hidden by a db interface that crawlers don't cross. 2) To penetrate official pages with no std. labels leading to: a) People's pages b) People lists, staff lists c) To people-search boxes ================ MAIN COMP. TECHNIQUES == THE SOLUTIONs APPLIED 1. Human name mappings (from mynamematcher code) 1b Search on 2 or more words/names, not only on surname 2. Searching pages that google can't: dbs. Learning to drive them. 3. Follow a chain of 2 or more pages (punching through) (4. Domains: localise search to univ, depts, ... (5. Learning to find staff lists (what word patterns, markers?) and punching through to the person on such a list. ================ MAIN LOOP PLAN Search page for names If associated with a URL, then iterate/recurse to that page. Most normal exit: found a page with name hits but NOT in a new URL. What starting points? univ DB Google (restricted to UoG) Univ list of depts ================ Google Google is good for finding a person's home page in one, if not hidden by a database. Good vs. 
bad aspects: If exact name (not Rob <-> Bob or robert) If can fix it so it is read into PHP If can pick first of them. How to decide automatically if a good page found? One recipe to consider: UoGdb Person's dept. get db list of dept people Pick best one Use the name from that Google that name (but might say Robert not Bob?) --- Impl. to-dos: Get google to "lucky" jump to its one top ranked hit Get it read into PHP ? ====== Big features for the future Google $meta: fetch URLs of staff lists to use. In a subdomain e.g. at other HEI Form box for inputting search names, and so a new page (even if same addr) with lots of doc on this facility and how to search with it. Ways of generating 2 windows (best page, and trace index page) Return index page with link to BEST at top Return best page with link to trace page; Concat the 2 pages? Empty surname means concat all 26 1-letter searches in db Expand pattern rules in mkpatterns() ====== Menu: Original search call <url link> Original call or args, after names processing, before loop starts. ?Google option: when clicked, will run google on the names. "index" Page showing matches, links etc. <link> Original direct URL of unmodified version of the page below <link> ================ REs 1) Basically they find the first literal; then extend without search; then restart (in match-all) after end of whole match. 2) non-greedy just means shortest poss: not shortest poss to get overall match. 3) One approach for complex multi-brak matches is to go TD (top-down): match outer braks first, i.e. split on them; then loop over the array of matches, searching for inner braks. 4) Another is BU (bottom-up): match innermost first; then re-use the found match string as a literal string in re-search plus outer braks. (Used here for finding lines around <A link>.) It is important in the re-search loop to use pos. to restart search of next found item later in buf: there may be multiple different instances that are identical strings (but part of diff. lines). 
5) Another is: match to either/or ($pat|$endbrak) then post-test that it is $pat. 6) Actual sol here: match for either/or, but assertion (not match) of trailing endbrak, so matching restarts to include it if .. A) Reasons for str-Replace() vs. preg_replace(). a) Comp. speed b) Saves having to have "/.../" braks in arg1 c) No need to escape the RE metachars (and delimiter /s) i.e. to use preg_quote. B) Can capture pos. along with matches. It is the pos in buf of the start of the matched string. C) Can capture content of lookahead and lookbehind BUT only if you have inner () inside them. And can capture pos. with them. ========= Smalls Haven't got urlencode() to work. Need? a version that doesn't munge the protocol and/or server. There are 4 issues a) spaces b) search chars: ?=&+ c) &amp; instead of & d) &#123; codes Get rel. URLs fully working. Untested: using spaces etc. within names given as search args. Can't get, in index trace print, "lines" to print on one line e.g. chem page ======= CODE JUNKYARD $linepat = preg_replace('/[A-Z]+/e', '$0 . "|" . strtolower($0)', $linepat); $lines = preg_split(';$linepat;s', $buf); for ($i = 1; $i < 10; $i++) $a[] = $i; foreach ($a as $v) echo "$v<BR>"; foreach ($a as $key => $val) { echo "$key => $val <BR>"; }; http://www.psy.gla.ac.uk/~steve/wscontents.html ==================== // 3b2: Insert a Java find/autoscroll $buf = preg_replace('/<script/', '<qscript', $buf); // disable Java if (! preg_match("/([<>])[^<>]*$pat/s$case", $buf, $matches)) { $findstr3 = '//'; $replace = ''; } elseif ($matches[1] == '<') { // inside a tag $findstr3 = ";\Q$matches[0]\E;s"; $replace = '<A NAME="' . $anchor . '"> </A>$0'; } else { // not inside a construct $findstr3 = $findstr2; $replace = '<A NAME="' . $anchor . 
'">$0</A>'; } ==================== //$linefortag = preg_quote($linefortag, '/'); //$buf2 = preg_replace("/$linefortag/s", $replace, $buf, 1); // test $buf = 'XabcdefABCDEF'; $pat = '/(ab)c/i'; $n = preg_match_all($pat, $buf, $matches, PREG_OFFSET_CAPTURE); // offset is start of matched pat. // get offsets for all caught subpatterns: everything becomes an array of str+offset print_r(parse_url($uogdb)); exit; //$s = http_head($url); Sends HEAD request to remote page server. // (isset($a) && strlen($a)) === isset($a{0}). function Xreturncode($url) { $a = get_headers($url); $c = preg_replace("/^[^ ]* /is", '', $a[0]); return $c; } function returncode($url) { $a = get_headers($url); $c = preg_replace("/^[^ ]* /is", '', $a[0]); return $c; } // Test HTTP response headers returned //foreach line from end to top, if location, change URL $a = $http_response_header; krsort($a); foreach ($a as $line) { if (preg_match('/^Location:/is', $line)) { $url = preg_replace("|^[^:]*:\s*|is", '', $line); break; } }; //foreach line from end to top, if http extract .. foreach ($a as $line) { if (preg_match('/^HTTP/is', $line)) { $s = preg_replace("|^[^ ]*\s*|is", '', $line); $n = preg_replace("| .*|is", '', $s); if ($n > 200) { $buf = ''; $index .= "HTTP Response: $s<BR>"; }; break; } }; echo "HTTP n=$n <BR>"; // $buf = preg_replace(';((?:HREF|SRC)\s*=\s*(?:\'|"|))([?+=&.a-z]+[^:a-z]);si', // '$1' . $urldir . '$2', $buf); // fixed above pattern to allow ? = & to start rel. URLs // prepend server.dir base to rel url // That pattern works. // but better would be to say not if matches http: // realise I need extra case: when rel. begins # or ?, then need filename as well as dirbase. // preg_match_all(';((?:HREF|SRC)\s*=\s*(?:\'|"|))(?![a-z]+:)([^"\']+);si', // $buf, $matches); // echo "$url<BR>"; myprintr($matches); */ ?>